[llvm] r320849 - [SelectionDAG][X86] Fix insert_vector_elt lowering for v32i1/v64i1 with non-constant index

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 15 11:35:22 PST 2017


Author: ctopper
Date: Fri Dec 15 11:35:22 2017
New Revision: 320849

URL: http://llvm.org/viewvc/llvm-project?rev=320849&view=rev
Log:
[SelectionDAG][X86] Fix insert_vector_elt lowering for v32i1/v64i1 with non-constant index

Summary:
Currently we don't handle v32i1/v64i1 insert_vector_elt correctly: we fail to look closely at the number of elements and assume the type can only be v16i1 or v8i1.

We also can't type legalize v64i1 insert_vector_elt correctly on KNL because the type is not byte addressable, which the legalize-through-memory-accesses path requires.

For the first issue, the patch now picks a 512-bit vector type with the correct number of elements and promotes to that.
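
The promoted element width is just the chosen vector width divided by the element count. A minimal standalone sketch of that mapping (illustration only, not LLVM code; the variable names merely mirror the X86ISelLowering.cpp hunk below):

  #include <cstdio>
  #include <initializer_list>

  int main() {
    for (unsigned NumElts : {2u, 4u, 8u, 16u, 32u, 64u}) {
      unsigned VecSize = (NumElts <= 4 ? 128 : 512); // small masks stay 128-bit
      unsigned EltBits = VecSize / NumElts;          // v32i1 -> i16, v64i1 -> i8
      std::printf("v%ui1 promotes to v%ui%u\n", NumElts, NumElts, EltBits);
    }
    return 0;
  }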

For the second issue, we now extend the vector to a byte-addressable type, perform the stores to memory, load the two halves, and then truncate the halves back to the original type. Technically, since we changed the type we may not need two loads, but checking for that is more work, and for the v64i1 case we do need them.
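
Roughly, the split path now behaves like the following scalar model of a v64i1 insert with a variable index (a hand-written sketch over a plain 64-bit mask, not the actual DAG legalizer code): widen each bit to a byte, spill, poke the byte at the runtime index, reload the two halves, and truncate each half back to bits.

  #include <cstdint>

  void insert_v64i1(uint64_t &Mask, bool Bit, unsigned Index) {
    uint8_t Bytes[64];                        // "any_extend" v64i1 -> v64i8
    for (unsigned i = 0; i != 64; ++i)
      Bytes[i] = (Mask >> i) & 1;             // spill the widened vector
    Bytes[Index & 63] = Bit;                  // store the element at the variable index
    uint32_t Lo = 0, Hi = 0;                  // reload the two halves...
    for (unsigned i = 0; i != 32; ++i) {
      Lo |= uint32_t(Bytes[i] & 1) << i;      // ...and truncate back to v32i1 each
      Hi |= uint32_t(Bytes[i + 32] & 1) << i;
    }
    Mask = (uint64_t(Hi) << 32) | Lo;
  }

  int main() {
    uint64_t M = 0;
    insert_v64i1(M, true, 40);
    return M == (1ULL << 40) ? 0 : 1;
  }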

Reviewers: RKSimon, delena, spatel, zvi

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D40942

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Fri Dec 15 11:35:22 2017
@@ -1054,9 +1054,20 @@ void DAGTypeLegalizer::SplitVecRes_INSER
   if (CustomLowerNode(N, N->getValueType(0), true))
     return;
 
-  // Spill the vector to the stack.
+  // Make the vector elements byte-addressable if they aren't already.
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
+  if (VecVT.getScalarSizeInBits() < 8) {
+    EltVT = MVT::i8;
+    VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                             VecVT.getVectorNumElements());
+    Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+    // Extend the element type to match if needed.
+    if (EltVT.bitsGT(Elt.getValueType()))
+      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
+  }
+
+  // Spill the vector to the stack.
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   auto &MF = DAG.getMachineFunction();
   auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
@@ -1071,19 +1082,29 @@ void DAGTypeLegalizer::SplitVecRes_INSER
   Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
                             MachinePointerInfo::getUnknownStack(MF), EltVT);
 
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+
   // Load the Lo part from the stack slot.
-  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo);
+  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
+  unsigned IncrementSize = LoVT.getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                          DAG.getConstant(IncrementSize, dl,
                                          StackPtr.getValueType()));
 
   // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
+  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
                    PtrInfo.getWithOffset(IncrementSize),
                    MinAlign(Alignment, IncrementSize));
+
+  // If we adjusted the original type, we need to truncate the results.
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  if (LoVT != Lo.getValueType())
+    Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
+  if (HiVT != Hi.getValueType())
+    Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
 }
 
 void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Dec 15 11:35:22 2017
@@ -14539,7 +14539,7 @@ static SDValue ExtractBitFromMaskVector(
     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
     // than extending to 128/256bit.
     unsigned VecSize = (NumElts <= 4 ? 128 : 512);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               ExtVT.getVectorElementType(), Ext, Idx);
@@ -14725,8 +14725,10 @@ static SDValue InsertBitToMaskVector(SDV
   if (!isa<ConstantSDNode>(Idx)) {
     // Non constant index. Extend source and destination,
     // insert element and then truncate the result.
-    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
-    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
+    unsigned NumElts = VecVT.getVectorNumElements();
+    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+    MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    MVT ExtEltVT = ExtVecVT.getVectorElementType();
     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=320849&r1=320848&r2=320849&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Fri Dec 15 11:35:22 2017
@@ -1774,3 +1774,592 @@ define <8 x i64> @insert_double_zero(<2
   %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i64> %e
 }
+
+define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v32i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $96, %rsp
+; KNL-NEXT:    ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    andl $31, %esi
+; KNL-NEXT:    testb %dil, %dil
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    setne (%rsi,%rax)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    movl (%rsp), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v32i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT:    xorl %eax, %eax
+; SKX-NEXT:    testb %dil, %dil
+; SKX-NEXT:    setne %al
+; SKX-NEXT:    vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    andl $31, %esi
+; SKX-NEXT:    movw %ax, (%rsp,%rsi,2)
+; SKX-NEXT:    vpsllw $15, (%rsp), %zmm0
+; SKX-NEXT:    vpmovw2m %zmm0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <32 x i1> %t3 to i32
+  ret i32 %t4
+}
+
+define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v64i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $192, %rsp
+; KNL-NEXT:    ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    andl $63, %esi
+; KNL-NEXT:    testb %dil, %dil
+; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    setne (%rsi,%rax)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
+; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT:    movl (%rsp), %eax
+; KNL-NEXT:    shlq $32, %rax
+; KNL-NEXT:    orq %rcx, %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v64i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT:    andl $63, %esi
+; SKX-NEXT:    testb %dil, %dil
+; SKX-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    setne (%rsi,%rax)
+; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    kmovq %k0, %rax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <64 x i1> %t3 to i64
+  ret i64 %t4
+}
+
+define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v96i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-128, %rsp
+; KNL-NEXT:    subq $384, %rsp ## imm = 0x180
+; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 488(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $2, 496(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, 504(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, 512(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, 520(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, 528(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $7, 536(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, 544(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $9, 552(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $10, 560(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $11, 568(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, 576(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $13, 584(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, 592(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $15, 600(%rbp), %xmm0, %xmm0
+; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 616(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $2, 624(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $3, 632(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $4, 640(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $5, 648(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $6, 656(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $7, 664(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, 672(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $9, 680(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $10, 688(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $11, 696(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $12, 704(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, 712(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $14, 720(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $15, 728(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 232(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $2, 240(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $3, 248(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $4, 256(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $5, 264(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $6, 272(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $7, 280(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, 288(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $9, 296(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $10, 304(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $11, 312(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $12, 320(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, 328(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $14, 336(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $15, 344(%rbp), %xmm1, %xmm1
+; KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 360(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $2, 368(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $3, 376(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $4, 384(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $5, 392(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $6, 400(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $7, 408(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $8, 416(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $9, 424(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $10, 432(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $11, 440(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $12, 448(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $13, 456(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $14, 464(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $15, 472(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT:    vmovd %edi, %xmm2
+; KNL-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $4, %r8d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $5, %r9d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $6, 16(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $7, 24(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $8, 32(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $9, 40(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $10, 48(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $11, 56(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $12, 64(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $13, 72(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $14, 80(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $15, 88(%rbp), %xmm2, %xmm2
+; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 104(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $2, 112(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $3, 120(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $4, 128(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $5, 136(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $6, 144(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $7, 152(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $8, 160(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $9, 168(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $10, 176(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $11, 184(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $12, 192(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $13, 200(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $14, 208(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $15, 216(%rbp), %xmm3, %xmm3
+; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; KNL-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm2
+; KNL-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
+; KNL-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm3, %ymm0, %ymm0
+; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT:    movl 744(%rbp), %eax
+; KNL-NEXT:    andl $127, %eax
+; KNL-NEXT:    cmpb $0, 736(%rbp)
+; KNL-NEXT:    vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; KNL-NEXT:    setne (%rax,%rcx)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT:    vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
+; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT:    shlq $32, %rax
+; KNL-NEXT:    orq %rcx, %rax
+; KNL-NEXT:    movl (%rsp), %ecx
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v96i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-128, %rsp
+; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
+; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT:    vpinsrb $1, 360(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $2, 368(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $3, 376(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $4, 384(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $5, 392(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $6, 400(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $7, 408(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $8, 416(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $9, 424(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $10, 432(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $11, 440(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $12, 448(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $13, 456(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $14, 464(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $6, 16(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $7, 24(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $8, 32(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $9, 40(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $10, 48(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $11, 56(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $12, 64(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $13, 72(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $14, 80(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $15, 88(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT:    vpinsrb $1, 104(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $2, 112(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $3, 120(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $4, 128(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $5, 136(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $6, 144(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $7, 152(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $8, 160(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $9, 168(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $10, 176(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $11, 184(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $12, 192(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $13, 200(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $14, 208(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $15, 216(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT:    vpinsrb $1, 488(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $2, 496(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $3, 504(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $4, 512(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $5, 520(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $6, 528(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $7, 536(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $8, 544(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $9, 552(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $10, 560(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $11, 568(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $12, 576(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $13, 584(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $14, 592(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vpinsrb $15, 600(%rbp), %xmm1, %xmm1
+; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT:    vpinsrb $1, 616(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $2, 624(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $3, 632(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $4, 640(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $5, 648(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $6, 656(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $7, 664(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $8, 672(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $9, 680(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $10, 688(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $11, 696(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $12, 704(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $13, 712(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $14, 720(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vpinsrb $15, 728(%rbp), %xmm2, %xmm2
+; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT:    movl 744(%rbp), %eax
+; SKX-NEXT:    andl $127, %eax
+; SKX-NEXT:    cmpb $0, 736(%rbp)
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT:    vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    movq %rsp, %rcx
+; SKX-NEXT:    setne (%rax,%rcx)
+; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k1
+; SKX-NEXT:    kmovq %k1, %rax
+; SKX-NEXT:    kmovq %k0, %rdx
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <96 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <96 x i1> %t3 to i96
+  ret i96 %t4
+}
+
+define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v128i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-128, %rsp
+; KNL-NEXT:    subq $384, %rsp ## imm = 0x180
+; KNL-NEXT:    ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtb %ymm4, %ymm1, %ymm1
+; KNL-NEXT:    vpxor %ymm4, %ymm2, %ymm2
+; KNL-NEXT:    vpcmpgtb %ymm4, %ymm2, %ymm2
+; KNL-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; KNL-NEXT:    vpcmpgtb %ymm4, %ymm3, %ymm3
+; KNL-NEXT:    andl $127, %esi
+; KNL-NEXT:    testb %dil, %dil
+; KNL-NEXT:    vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    setne (%rsi,%rax)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT:    vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
+; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT:    shlq $32, %rax
+; KNL-NEXT:    orq %rcx, %rax
+; KNL-NEXT:    movl (%rsp), %ecx
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v128i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-128, %rsp
+; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
+; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT:    andl $127, %esi
+; SKX-NEXT:    testb %dil, %dil
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT:    vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    setne (%rsi,%rax)
+; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k1
+; SKX-NEXT:    kmovq %k1, %rax
+; SKX-NEXT:    kmovq %k0, %rdx
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <128 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <128 x i1> %t3 to i128
+  ret i128 %t4
+}



