[llvm] r320849 - [SelectionDAG][X86] Fix insert_vector_elt lowering for v32i1/v64i1 with non-constant index
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 15 11:35:22 PST 2017
Author: ctopper
Date: Fri Dec 15 11:35:22 2017
New Revision: 320849
URL: http://llvm.org/viewvc/llvm-project?rev=320849&view=rev
Log:
[SelectionDAG][X86] Fix insert_vector_elt lowering for v32i1/v64i1 with non-constant index
Summary:
Currently we don't handle v32i1/v64i1 insert_vector_elt correctly: we fail to look closely at the number of elements and assume the type can only be v16i1 or v8i1.
We also can't type legalize v64i1 insert_vector_elt correctly on KNL because the type is not byte addressable, which the legalize-through-memory-accesses path requires.
For the first issue, the patch now tries to pick a 512-bit register with the correct number of elements and promotes to that.
For the second issue, we now extend the vector to a byte addressable type, do the stores to memory, load the two halves, and then truncate the halves back to the original type. Technically, since we changed the type, we may not need two loads, but checking for that is more work, and for the v64i1 case we do need them.
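For illustration, a minimal IR example (hypothetical, not part of the patch; the added tests below exercise the same path through vector compares and a bitcast to an integer) of the kind of operation that previously hit both problems:

  define <64 x i1> @insert_v64i1_var(<64 x i1> %v, i1 %b, i32 %index) {
    %r = insertelement <64 x i1> %v, i1 %b, i32 %index
    ret <64 x i1> %r
  }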
Reviewers: RKSimon, delena, spatel, zvi
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D40942
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Fri Dec 15 11:35:22 2017
@@ -1054,9 +1054,20 @@ void DAGTypeLegalizer::SplitVecRes_INSER
if (CustomLowerNode(N, N->getValueType(0), true))
return;
- // Spill the vector to the stack.
+ // Make the vector elements byte-addressable if they aren't already.
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
+ if (VecVT.getScalarSizeInBits() < 8) {
+ EltVT = MVT::i8;
+ VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VecVT.getVectorNumElements());
+ Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+ // Extend the element type to match if needed.
+ if (EltVT.bitsGT(Elt.getValueType()))
+ Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
+ }
+
+ // Spill the vector to the stack.
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
@@ -1071,19 +1082,29 @@ void DAGTypeLegalizer::SplitVecRes_INSER
Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
MachinePointerInfo::getUnknownStack(MF), EltVT);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo);
+ Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo);
// Increment the pointer to the other part.
- unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
+ unsigned IncrementSize = LoVT.getSizeInBits() / 8;
StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
DAG.getConstant(IncrementSize, dl,
StackPtr.getValueType()));
// Load the Hi part from the stack slot.
- Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
+ Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize),
MinAlign(Alignment, IncrementSize));
+
+ // If we adjusted the original type, we need to truncate the results.
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ if (LoVT != Lo.getValueType())
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
+ if (HiVT != Hi.getValueType())
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
}
void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Dec 15 11:35:22 2017
@@ -14539,7 +14539,7 @@ static SDValue ExtractBitFromMaskVector(
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
unsigned VecSize = (NumElts <= 4 ? 128 : 512);
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
@@ -14725,8 +14725,10 @@ static SDValue InsertBitToMaskVector(SDV
if (!isa<ConstantSDNode>(Idx)) {
// Non constant index. Extend source and destination,
// insert element and then truncate the result.
- MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ MVT ExtEltVT = ExtVecVT.getVectorElementType();
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Fri Dec 15 11:35:22 2017
@@ -1774,3 +1774,592 @@ define <8 x i64> @insert_double_zero(<2
%e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
ret <8 x i64> %e
}
+
+define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v32i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $96, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: andl $31, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v32i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: setne %al
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: andl $31, %esi
+; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
+; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
+; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <32 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <32 x i1> %t3 to i32
+ ret i32 %t4
+}
+
+define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v64i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
+; KNL-NEXT: andl $63, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v64i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT: andl $63, %esi
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kmovq %k0, %rax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <64 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <64 x i1> %t3 to i64
+ ret i64 %t4
+}
+
+define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v96i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-128, %rsp
+; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vmovd %edi, %xmm2
+; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT: movl 744(%rbp), %eax
+; KNL-NEXT: andl $127, %eax
+; KNL-NEXT: cmpb $0, 736(%rbp)
+; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; KNL-NEXT: setne (%rax,%rcx)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpslld $31, %zmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: shlq $32, %rdx
+; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v96i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-128, %rsp
+; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: movl 744(%rbp), %eax
+; SKX-NEXT: andl $127, %eax
+; SKX-NEXT: cmpb $0, 736(%rbp)
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rcx
+; SKX-NEXT: setne (%rax,%rcx)
+; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: kmovq %k1, %rax
+; SKX-NEXT: kmovq %k0, %rdx
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <96 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <96 x i1> %t3 to i96
+ ret i96 %t4
+}
+
+define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v128i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-128, %rsp
+; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: andl $127, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpslld $31, %zmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: shlq $32, %rdx
+; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v128i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-128, %rsp
+; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: andl $127, %esi
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: kmovq %k1, %rax
+; SKX-NEXT: kmovq %k0, %rdx
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <128 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <128 x i1> %t3 to i128
+ ret i128 %t4
+}