[llvm] r323201 - [X86] Legalize v32i1 without BWI via splitting to v16i1 rather than the default of promoting to v32i8.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 23 06:25:39 PST 2018
Author: ctopper
Date: Tue Jan 23 06:25:39 2018
New Revision: 323201
URL: http://llvm.org/viewvc/llvm-project?rev=323201&view=rev
Log:
[X86] Legalize v32i1 without BWI via splitting to v16i1 rather than the default of promoting to v32i8.
Summary:
For the most part, it's better to keep v32i1 as a mask type of a narrower width than to promote it to a ymm register.
I had to add overrides to the methods that compute the register types for the calling convention so that we still use v32i8 for argument/return purposes.
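For illustration, here is a hypothetical IR signature (not one of the tests in this patch) that exercises those overrides: a <32 x i1> value crossing a call boundary still travels as a single v32i8 ymm value under the ABI, even though it is split into two v16i1 masks internally.

define <32 x i1> @pass_mask(<32 x i1> %m) {
  ret <32 x i1> %m
}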
There are still some regressions in here; I definitely saw some around shuffles. I think we should probably move vXi1 shuffle handling from lowering to a DAG combine, where the extend and truncate we have to emit could be combined more effectively.
I think we also need a DAG combine to remove the trunc from (extract_vector_elt (trunc)).
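A minimal sketch of what that combine could look like, assuming the usual SelectionDAG combine entry points (the function name and placement are hypothetical, not part of this patch):

static SDValue combineExtractWithTruncate(SDNode *N, SelectionDAG &DAG) {
  // Fold (extract_vector_elt (truncate X), Idx) ->
  //      (truncate (extract_vector_elt X, Idx))
  // so the vector truncate disappears when only one lane is consumed.
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected node");
  SDValue Vec = N->getOperand(0);
  if (Vec.getOpcode() != ISD::TRUNCATE)
    return SDValue();
  SDLoc DL(N);
  SDValue Src = Vec.getOperand(0);
  SDValue Idx = N->getOperand(1);
  EVT SrcEltVT = Src.getValueType().getVectorElementType();
  // Extract the lane from the wide source, then narrow just that scalar.
  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src, Idx);
  return DAG.getAnyExtOrTrunc(Elt, DL, N->getValueType(0));
}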
Overall this removes something like 13000 CHECK lines from lit tests.
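As a hypothetical reduced example of the kind of pattern these tests exercise (this exact function is not in the patch), consider a masked select over 32 bytes:

define <32 x i8> @select_v32i8(<32 x i1> %mask, <32 x i8> %a, <32 x i8> %b) {
  %r = select <32 x i1> %mask, <32 x i8> %a, <32 x i8> %b
  ret <32 x i8> %r
}

With AVX512F but without BWI, the <32 x i1> mask is now legalized as two v16i1 halves living in k-registers rather than being promoted to a v32i8 ymm value, which is what eliminates the stack round-trips visible in the diffs below.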
Reviewers: zvi, RKSimon, delena, spatel
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D42031
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h
llvm/trunk/test/CodeGen/X86/avg-mask.ll
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll
llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll
llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll
llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll
llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Jan 23 06:25:39 2018
@@ -1720,6 +1720,9 @@ SDValue X86TargetLowering::emitStackGuar
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return TypeSplitVector;
+
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType().getSimpleVT() != MVT::i1)
@@ -1728,6 +1731,26 @@ X86TargetLowering::getPreferredVectorAct
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+MVT X86TargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return MVT::v32i8;
+ return TargetLowering::getRegisterTypeForCallingConv(VT);
+}
+
+MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return MVT::v32i8;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
+}
+
+unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ return 1;
+ return TargetLowering::getNumRegistersForCallingConv(Context, VT);
+}
+
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Tue Jan 23 06:25:39 2018
@@ -1084,6 +1084,14 @@ namespace llvm {
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+ MVT getRegisterTypeForCallingConv(MVT VT) const override;
+
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
Modified: llvm/trunk/test/CodeGen/X86/avg-mask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg-mask.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg-mask.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg-mask.ll Tue Jan 23 06:25:39 2018
@@ -60,22 +60,16 @@ define <16 x i8> @avg_v16i8_maskz(<16 x
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
@@ -98,22 +92,16 @@ define <32 x i8> @avg_v32i8_mask(<32 x i
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
@@ -135,33 +123,30 @@ define <32 x i8> @avg_v32i8_maskz(<32 x
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $64, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: shrq $32, %rdi
-; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq %rdi, %rcx
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: movl %edi, %edx
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: kmovw %ecx, %k2
+; AVX512F-NEXT: kmovw %eax, %k3
+; AVX512F-NEXT: kmovw %edx, %k4
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
@@ -184,33 +169,30 @@ define <64 x i8> @avg_v64i8_mask(<64 x i
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $64, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: shrq $32, %rdi
-; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq %rdi, %rcx
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: movl %edi, %edx
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: kmovw %ecx, %k2
+; AVX512F-NEXT: kmovw %eax, %k3
+; AVX512F-NEXT: kmovw %edx, %k4
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
@@ -340,29 +322,17 @@ define <16 x i16> @avg_v16i16_maskz(<16
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
-; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
-; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
-; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: kmovw %edi, %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
-; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
-; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
@@ -385,29 +355,17 @@ define <32 x i16> @avg_v32i16_mask(<32 x
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: kmovw (%rsp), %k1
-; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
-; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
-; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: kmovw %edi, %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
-; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Tue Jan 23 06:25:39 2018
@@ -782,39 +782,20 @@ define <32 x i8> @test_insert_128_v32i8(
define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: movl (%rsp), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
+; KNL-NEXT: kshiftrw $4, %k0, %k1
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $11, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -1014,7 +995,10 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpextrb $2, %xmm0, %eax
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftrw $2, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -1041,7 +1025,10 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $15, %xmm0, %eax
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
@@ -1074,7 +1061,10 @@ define zeroext i8 @extractelement_v64i1_
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $15, %xmm0, %eax
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
@@ -1717,26 +1707,25 @@ define i32 @test_insertelement_variable_
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $96, %rsp
+; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
-; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: setne 32(%rsp,%rsi)
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vmovdqa %ymm0, (%rsp)
+; KNL-NEXT: setne (%rsp,%rsi)
+; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -1782,7 +1771,7 @@ define i64 @test_insertelement_variable_
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $192, %rsp
+; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -1792,30 +1781,32 @@ define i64 @test_insertelement_variable_
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: setne 64(%rsp,%rsi)
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vmovdqa %ymm0, (%rsp)
+; KNL-NEXT: setne (%rsp,%rsi)
+; KNL-NEXT: vmovdqa (%rsp), %ymm0
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: orl %eax, %ecx
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
@@ -1863,7 +1854,7 @@ define i96 @test_insertelement_variable_
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
@@ -1977,56 +1968,60 @@ define i96 @test_insertelement_variable_
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
-; KNL-NEXT: setne 128(%rsp,%rax)
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa %ymm2, (%rsp)
+; KNL-NEXT: setne (%rsp,%rax)
+; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
-; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: orl %eax, %ecx
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %edx, %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: shll $16, %esi
+; KNL-NEXT: orl %ecx, %esi
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: movl (%rsp), %ecx
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: shll $16, %edx
+; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
-; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -2178,7 +2173,7 @@ define i128 @test_insertelement_variable
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0
@@ -2194,56 +2189,60 @@ define i128 @test_insertelement_variable
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: setne 128(%rsp,%rsi)
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa %ymm0, (%rsp)
+; KNL-NEXT: setne (%rsp,%rsi)
+; KNL-NEXT: vmovdqa (%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
-; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
; KNL-NEXT: vpslld $31, %zmm4, %zmm4
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: orl %eax, %ecx
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %edx, %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: shll $16, %esi
+; KNL-NEXT: orl %ecx, %esi
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; KNL-NEXT: shlq $32, %rax
-; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: movl (%rsp), %ecx
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: shll $16, %edx
+; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
-; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Tue Jan 23 06:25:39 2018
@@ -975,38 +975,32 @@ define <64 x i8> @test16(i64 %x) {
;
; KNL-LABEL: test16:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: movl %edi, %ecx
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
-; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; KNL-NEXT: kmovw (%rsp), %k1
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: shrq $48, %rax
+; KNL-NEXT: shrl $16, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kmovw %edi, %k3
+; KNL-NEXT: movb $1, %al
+; KNL-NEXT: kmovw %eax, %k4
+; KNL-NEXT: kshiftrw $5, %k0, %k5
+; KNL-NEXT: kxorw %k4, %k5, %k4
+; KNL-NEXT: kshiftlw $15, %k4, %k4
+; KNL-NEXT: kshiftrw $10, %k4, %k4
+; KNL-NEXT: kxorw %k0, %k4, %k4
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: movl $1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
@@ -1037,38 +1031,32 @@ define <64 x i8> @test16(i64 %x) {
;
; AVX512DQ-LABEL: test16:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: .cfi_offset %rbp, -16
-; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
-; AVX512DQ-NEXT: andq $-32, %rsp
-; AVX512DQ-NEXT: subq $64, %rsp
-; AVX512DQ-NEXT: movl %edi, (%rsp)
+; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: movl %edi, %ecx
+; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
-; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: kmovw (%rsp), %k0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT: shrq $48, %rax
+; AVX512DQ-NEXT: shrl $16, %ecx
+; AVX512DQ-NEXT: kmovw %ecx, %k1
+; AVX512DQ-NEXT: kmovw %eax, %k2
+; AVX512DQ-NEXT: kmovw %edi, %k3
+; AVX512DQ-NEXT: movb $1, %al
+; AVX512DQ-NEXT: kmovw %eax, %k4
+; AVX512DQ-NEXT: kshiftrw $5, %k0, %k5
+; AVX512DQ-NEXT: kxorw %k4, %k5, %k4
+; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
+; AVX512DQ-NEXT: kxorw %k0, %k4, %k0
+; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
-; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: movl $1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: movq %rbp, %rsp
-; AVX512DQ-NEXT: popq %rbp
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = insertelement <64 x i1>%a, i1 true, i32 5
@@ -1080,40 +1068,33 @@ define <64 x i8> @test17(i64 %x, i32 %y,
;
; KNL-LABEL: test17:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: movl %edi, %ecx
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: shrq $32, %rdi
-; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; KNL-NEXT: kmovw (%rsp), %k1
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: shrq $48, %rax
+; KNL-NEXT: shrl $16, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kmovw %edi, %k3
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: kmovw %eax, %k4
+; KNL-NEXT: kshiftrw $5, %k0, %k5
+; KNL-NEXT: kxorw %k4, %k5, %k4
+; KNL-NEXT: kshiftlw $15, %k4, %k4
+; KNL-NEXT: kshiftrw $10, %k4, %k4
+; KNL-NEXT: kxorw %k0, %k4, %k4
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
@@ -1146,40 +1127,33 @@ define <64 x i8> @test17(i64 %x, i32 %y,
;
; AVX512DQ-LABEL: test17:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: .cfi_offset %rbp, -16
-; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
-; AVX512DQ-NEXT: andq $-32, %rsp
-; AVX512DQ-NEXT: subq $64, %rsp
-; AVX512DQ-NEXT: movl %edi, (%rsp)
+; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: movl %edi, %ecx
+; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: shrq $32, %rdi
-; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: kmovw (%rsp), %k0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
-; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: shrq $48, %rax
+; AVX512DQ-NEXT: shrl $16, %ecx
+; AVX512DQ-NEXT: kmovw %ecx, %k1
+; AVX512DQ-NEXT: kmovw %eax, %k2
+; AVX512DQ-NEXT: kmovw %edi, %k3
; AVX512DQ-NEXT: cmpl %edx, %esi
; AVX512DQ-NEXT: setg %al
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0
-; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: kmovw %eax, %k4
+; AVX512DQ-NEXT: kshiftrw $5, %k0, %k5
+; AVX512DQ-NEXT: kxorw %k4, %k5, %k4
+; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4
+; AVX512DQ-NEXT: kxorw %k0, %k4, %k0
+; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: movq %rbp, %rsp
-; AVX512DQ-NEXT: popq %rbp
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = icmp sgt i32 %y, %z
@@ -1815,51 +1789,29 @@ define void @ktest_2(<32 x float> %in, f
;
; KNL-LABEL: ktest_2:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
-; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm3, %xmm3
-; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
-; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
-; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm5, %xmm5
-; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2
-; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm4, %xmm4
-; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
-; KNL-NEXT: vpslld $31, %zmm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: cmpl $0, (%rsp)
+; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
+; KNL-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm3, %zmm1, %k0
+; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k3
+; KNL-NEXT: korw %k3, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: je LBB42_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovaps %zmm0, (%rdi)
; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
-; KNL-NEXT: jmp LBB42_3
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
; KNL-NEXT: LBB42_2: ## %L2
; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
-; KNL-NEXT: LBB42_3: ## %End
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -1917,51 +1869,29 @@ define void @ktest_2(<32 x float> %in, f
;
; AVX512DQ-LABEL: ktest_2:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: .cfi_offset %rbp, -16
-; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
-; AVX512DQ-NEXT: andq $-32, %rsp
-; AVX512DQ-NEXT: subq $32, %rsp
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
-; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
-; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; AVX512DQ-NEXT: vpmovm2d %k2, %zmm3
-; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
-; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
-; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
-; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
-; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
-; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
-; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT: kmovw %k0, (%rsp)
-; AVX512DQ-NEXT: cmpl $0, (%rsp)
+; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm3, %zmm1, %k0
+; AVX512DQ-NEXT: vcmpltps %zmm2, %zmm0, %k3
+; AVX512DQ-NEXT: korw %k3, %k2, %k2
+; AVX512DQ-NEXT: kmovw %k2, %eax
+; AVX512DQ-NEXT: korw %k0, %k1, %k0
+; AVX512DQ-NEXT: kmovw %k0, %ecx
+; AVX512DQ-NEXT: shll $16, %ecx
+; AVX512DQ-NEXT: orl %eax, %ecx
; AVX512DQ-NEXT: je LBB42_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi)
-; AVX512DQ-NEXT: jmp LBB42_3
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB42_2: ## %L2
; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi)
-; AVX512DQ-NEXT: LBB42_3: ## %End
-; AVX512DQ-NEXT: movq %rbp, %rsp
-; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%addr1 = getelementptr float, float * %base, i64 0
@@ -2334,14 +2264,14 @@ define void @store_16i1(<16 x i1>* %a, <
define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; KNL-LABEL: store_32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -2364,14 +2294,14 @@ define void @store_32i1(<32 x i1>* %a, <
;
; AVX512DQ-LABEL: store_32i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2383,12 +2313,12 @@ define void @store_32i1_1(<32 x i1>* %a,
; KNL-LABEL: store_32i1_1:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -2412,12 +2342,12 @@ define void @store_32i1_1(<32 x i1>* %a,
; AVX512DQ-LABEL: store_32i1_1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2431,21 +2361,21 @@ define void @store_64i1(<64 x i1>* %a, <
;
; KNL-LABEL: store_64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
-; KNL-NEXT: vpslld $31, %zmm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
-; KNL-NEXT: kmovw %k0, 6(%rdi)
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT: kmovw %k0, 4(%rdi)
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k3
+; KNL-NEXT: kmovw %k3, 6(%rdi)
+; KNL-NEXT: kmovw %k2, 4(%rdi)
+; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kmovw %k0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -2468,21 +2398,21 @@ define void @store_64i1(<64 x i1>* %a, <
;
; AVX512DQ-LABEL: store_64i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
-; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
-; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k2
+; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k3
+; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
+; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
+; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll Tue Jan 23 06:25:39 2018
@@ -240,18 +240,18 @@ declare <16 x double> @llvm.masked.load.
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512F-LABEL: test_load_32f64:
; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm5
; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k2}
+; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k2, %k2
+; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k2}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
-; AVX512F-NEXT: kshiftrw $8, %k2, %k1
; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT: vmovapd %zmm5, %zmm2
; AVX512F-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Tue Jan 23 06:25:39 2018
@@ -206,21 +206,12 @@ define i16 @test12(<16 x i64> %a, <16 x
define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-LABEL: test12_v32i32:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: movl (%rsp), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %ecx, %eax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -249,32 +240,28 @@ define i32 @test12_v32i32(<32 x i32> %a,
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-LABEL: test12_v64i16:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, (%rsp)
-; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
+; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl (%rsp), %ecx
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: shll $16, %eax
+; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;