[llvm] r265259 - AVX-512: Load and Extended Load for i1 vectors

Elena Demikhovsky via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 3 01:41:14 PDT 2016


Author: delena
Date: Sun Apr  3 03:41:12 2016
New Revision: 265259

URL: http://llvm.org/viewvc/llvm-project?rev=265259&view=rev
Log:
AVX-512: Load and Extended Load for i1 vectors

Implemented load + {sign|zero}_extend for i1 vectors.
Fixed failures in i1 vector loads.
Covered loading of v2i1, v4i1, v8i1, v16i1, v32i1 and v64i1 vectors for KNL and SKX.

Differential Revision: http://reviews.llvm.org/D18737
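
For reference, the IR pattern this patch implements lowering for is an i1-vector load feeding an extend, as in this minimal sketch (typed pointers as in the 2016-era tests below; the function name is illustrative, not from the patch):

  define <8 x i64> @sext_load_v8i1(<8 x i1>* %p) {
    %m = load <8 x i1>, <8 x i1>* %p      ; eight mask bits stored in one byte
    %v = sext <8 x i1> %m to <8 x i64>    ; each bit becomes 0 or -1
    ret <8 x i64> %v
  }

Per the new load_8i1 test below, SKX selects this to kmovb + vpmovm2q, while KNL goes through a scalar movzbw + kmovw and a masked broadcast.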


Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Apr  3 03:41:12 2016
@@ -1384,8 +1384,17 @@ X86TargetLowering::X86TargetLowering(con
     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
-    setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16i1,  Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8i1,   Legal);
 
+    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
+                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
+                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
+      setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
+    }
     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
@@ -1661,6 +1670,8 @@ X86TargetLowering::X86TargetLowering(con
     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
 
+    setOperationAction(ISD::LOAD,               MVT::v32i1, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v64i1, Legal);
     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
@@ -1757,6 +1768,8 @@ X86TargetLowering::X86TargetLowering(con
     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
 
+    setOperationAction(ISD::LOAD,               MVT::v2i1, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4i1, Legal);
     setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
@@ -16093,6 +16106,98 @@ static SDValue LowerSIGN_EXTEND(SDValue
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+static SDValue LowerExtended1BitVectorLoad(SDValue Op,
+                                           const X86Subtarget &Subtarget,
+                                           SelectionDAG &DAG) {
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
+         "Expected i1 vector load");
+  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
+    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+  MVT VT = Op.getValueType().getSimpleVT();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+      NumElts == 16) {
+    // Load and extend - everything is legal
+    if (NumElts < 8) {
+      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
+                                 Ld->getBasePtr(),
+                                 Ld->getMemOperand());
+      // Replace chain users with the new chain.
+      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                                   DAG.getIntPtrConstant(0, dl));
+    }
+    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
+                               Ld->getBasePtr(),
+                               Ld->getMemOperand());
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal extend (sign or zero) to the desired register.
+    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
+  }
+
+  if (NumElts <= 8) {
+    // A subset of a full mask register; assume that we have only AVX-512F.
+    unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
+    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
+                              Ld->getBasePtr(),
+                              Ld->getMemOperand());
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
+    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
+
+    if (NumElts == 8)
+      return DAG.getNode(ExtOpcode, dl, VT, BitVec);
+
+    // We should take care of v4i1 and v2i1: extend to 8 elements and
+    // extract the low subvector.
+    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                        DAG.getIntPtrConstant(0, dl));
+  }
+
+  assert(VT == MVT::v32i8 && "Unexpected extload type");
+
+  SmallVector<SDValue, 2> Chains;
+
+  SDValue BasePtr = Ld->getBasePtr();
+  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+                               Ld->getBasePtr(),
+                               Ld->getMemOperand());
+  Chains.push_back(LoadLo.getValue(1));
+
+  SDValue BasePtrHi =
+    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+                               BasePtrHi,
+                               Ld->getMemOperand());
+  Chains.push_back(LoadHi.getValue(1));
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+
+  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
+  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+}
+
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -16113,6 +16218,9 @@ static SDValue LowerExtendedLoad(SDValue
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
   EVT MemVT = Ld->getMemoryVT();
+  if (MemVT.getScalarType() == MVT::i1)
+    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned RegSz = RegVT.getSizeInBits();
 

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Sun Apr  3 03:41:12 2016
@@ -2091,6 +2091,11 @@ let Predicates = [HasDQI] in {
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
   def : Pat<(store VK1:$src, addr:$dst),
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
 }
 let Predicates = [HasAVX512, NoDQI] in {
   def : Pat<(store VK1:$src, addr:$dst),
@@ -2110,18 +2115,19 @@ let Predicates = [HasAVX512, NoDQI] in {
              (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
               sub_8bit))>;
 
-  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
-            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
-  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
-            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+  def : Pat<(v8i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX16rm8 addr:$src), VK8)>;
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX16rm8 addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX16rm8 addr:$src), VK4)>;
 }
+
 let Predicates = [HasAVX512] in {
   def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
             (KMOVWmk addr:$dst, VK16:$src)>;
   def : Pat<(i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0),
-                                              (MOV8rm addr:$src), sub_8bit)),
-                                (i16 1)), VK1)>;
+            (COPY_TO_REGCLASS (AND16ri (MOVZX16rm8 addr:$src), (i16 1)), VK1)>;
   def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
             (KMOVWkm addr:$src)>;
 }
@@ -2130,8 +2136,6 @@ let Predicates = [HasBWI] in {
             (KMOVDmk addr:$dst, VK32:$src)>;
   def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
             (KMOVDkm addr:$src)>;
-}
-let Predicates = [HasBWI] in {
   def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
             (KMOVQmk addr:$dst, VK64:$src)>;
   def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
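
A minimal example of what the NoDQI load patterns above now cover (illustrative IR, not from the patch; the selection claims follow from the patterns in this diff and the load_8i1 test below):

  define i16 @mask_byte(<8 x i1>* %p) {
    ; without DQI the v8i1 load selects to MOVZX16rm8 plus a copy into
    ; the VK8 register class; with DQI a kmovb from memory is used instead
    %m = load <8 x i1>, <8 x i1>* %p
    %b = bitcast <8 x i1> %m to i8
    %z = zext i8 %b to i16
    ret i16 %z
  }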

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Sun Apr  3 03:41:12 2016
@@ -200,7 +200,7 @@ define i16 @test15(i1 *%addr) {
 }
 
 ;CHECK-LABEL: test16
-;CHECK: movb (%rdi), %al
+;CHECK: movzbw (%rdi), %ax
 ;CHECK: kmovw
 ;CHECK: kshiftlw        $10
 ;CHECK: korw
@@ -214,7 +214,7 @@ define i16 @test16(i1 *%addr, i16 %a) {
 }
 
 ;CHECK-LABEL: test17
-;KNL: movb (%rdi), %al
+;KNL: movzbw (%rdi), %ax
 ;KNL: andw $1, %ax
 ;KNL: kshiftlw $4
 ;KNL: korw

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Sun Apr  3 03:41:12 2016
@@ -53,9 +53,11 @@ define void @mask16_mem(i16* %ptr) {
 define void @mask8_mem(i8* %ptr) {
 ; KNL-LABEL: mask8_mem:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    kmovw (%rdi), %k0
+; KNL-NEXT:    movb (%rdi), %al
+; KNL-NEXT:    kmovw %eax, %k0
 ; KNL-NEXT:    knotw %k0, %k0
-; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: mask8_mem:
@@ -417,444 +419,6 @@ define <16 x i1> @test15(i32 %x, i32 %y)
 }
 
 define <64 x i8> @test16(i64 %x) {
-; KNL-LABEL: test16:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Ltmp0:
-; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:  Ltmp1:
-; KNL-NEXT:    .cfi_offset %rbp, -16
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:  Ltmp2:
-; KNL-NEXT:    .cfi_def_cfa_register %rbp
-; KNL-NEXT:    pushq %r15
-; KNL-NEXT:    pushq %r14
-; KNL-NEXT:    pushq %r13
-; KNL-NEXT:    pushq %r12
-; KNL-NEXT:    pushq %rbx
-; KNL-NEXT:    andq $-32, %rsp
-; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:  Ltmp3:
-; KNL-NEXT:    .cfi_offset %rbx, -56
-; KNL-NEXT:  Ltmp4:
-; KNL-NEXT:    .cfi_offset %r12, -48
-; KNL-NEXT:  Ltmp5:
-; KNL-NEXT:    .cfi_offset %r13, -40
-; KNL-NEXT:  Ltmp6:
-; KNL-NEXT:    .cfi_offset %r14, -32
-; KNL-NEXT:  Ltmp7:
-; KNL-NEXT:    .cfi_offset %r15, -24
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    shrq $32, %rax
-; KNL-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    movl $271, %eax ## imm = 0x10F
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    movl %edi, %ecx
-; KNL-NEXT:    andl $1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm0
-; KNL-NEXT:    movl $257, %ecx ## imm = 0x101
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $258, %ecx ## imm = 0x102
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $259, %ecx ## imm = 0x103
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $260, %ecx ## imm = 0x104
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $261, %ecx ## imm = 0x105
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $262, %ecx ## imm = 0x106
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $263, %ecx ## imm = 0x107
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $264, %ecx ## imm = 0x108
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $265, %ecx ## imm = 0x109
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $266, %ecx ## imm = 0x10A
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $267, %ecx ## imm = 0x10B
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $268, %ecx ## imm = 0x10C
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $269, %ecx ## imm = 0x10D
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    movl $270, %ecx ## imm = 0x10E
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
-; KNL-NEXT:    movl $1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT:    movq %r15, %rdx
-; KNL-NEXT:    shrq $17, %rdx
-; KNL-NEXT:    andb $1, %dl
-; KNL-NEXT:    je LBB22_2
-; KNL-NEXT:  ## BB#1:
-; KNL-NEXT:    movb $-1, %dl
-; KNL-NEXT:  LBB22_2:
-; KNL-NEXT:    movq %r15, %r11
-; KNL-NEXT:    shrq $16, %r11
-; KNL-NEXT:    andb $1, %r11b
-; KNL-NEXT:    je LBB22_4
-; KNL-NEXT:  ## BB#3:
-; KNL-NEXT:    movb $-1, %r11b
-; KNL-NEXT:  LBB22_4:
-; KNL-NEXT:    movq %r15, %r10
-; KNL-NEXT:    shrq $18, %r10
-; KNL-NEXT:    andb $1, %r10b
-; KNL-NEXT:    je LBB22_6
-; KNL-NEXT:  ## BB#5:
-; KNL-NEXT:    movb $-1, %r10b
-; KNL-NEXT:  LBB22_6:
-; KNL-NEXT:    movq %r15, %r9
-; KNL-NEXT:    shrq $19, %r9
-; KNL-NEXT:    andb $1, %r9b
-; KNL-NEXT:    je LBB22_8
-; KNL-NEXT:  ## BB#7:
-; KNL-NEXT:    movb $-1, %r9b
-; KNL-NEXT:  LBB22_8:
-; KNL-NEXT:    movq %r15, %rbx
-; KNL-NEXT:    shrq $20, %rbx
-; KNL-NEXT:    andb $1, %bl
-; KNL-NEXT:    je LBB22_10
-; KNL-NEXT:  ## BB#9:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB22_10:
-; KNL-NEXT:    movq %r15, %r12
-; KNL-NEXT:    shrq $21, %r12
-; KNL-NEXT:    andb $1, %r12b
-; KNL-NEXT:    je LBB22_12
-; KNL-NEXT:  ## BB#11:
-; KNL-NEXT:    movb $-1, %r12b
-; KNL-NEXT:  LBB22_12:
-; KNL-NEXT:    movq %r15, %r14
-; KNL-NEXT:    shrq $22, %r14
-; KNL-NEXT:    andb $1, %r14b
-; KNL-NEXT:    je LBB22_14
-; KNL-NEXT:  ## BB#13:
-; KNL-NEXT:    movb $-1, %r14b
-; KNL-NEXT:  LBB22_14:
-; KNL-NEXT:    movq %r15, %r8
-; KNL-NEXT:    shrq $23, %r8
-; KNL-NEXT:    andb $1, %r8b
-; KNL-NEXT:    je LBB22_16
-; KNL-NEXT:  ## BB#15:
-; KNL-NEXT:    movb $-1, %r8b
-; KNL-NEXT:  LBB22_16:
-; KNL-NEXT:    movq %r15, %r13
-; KNL-NEXT:    shrq $24, %r13
-; KNL-NEXT:    andb $1, %r13b
-; KNL-NEXT:    je LBB22_18
-; KNL-NEXT:  ## BB#17:
-; KNL-NEXT:    movb $-1, %r13b
-; KNL-NEXT:  LBB22_18:
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $25, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_20
-; KNL-NEXT:  ## BB#19:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_20:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $26, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_22
-; KNL-NEXT:  ## BB#21:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_22:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movl $272, %esi ## imm = 0x110
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $27, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_24
-; KNL-NEXT:  ## BB#23:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_24:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movl $273, %eax ## imm = 0x111
-; KNL-NEXT:    bextrl %esi, %edi, %esi
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $28, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB22_26
-; KNL-NEXT:  ## BB#25:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB22_26:
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vmovd %esi, %xmm2
-; KNL-NEXT:    movl $274, %esi ## imm = 0x112
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $29, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB22_28
-; KNL-NEXT:  ## BB#27:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB22_28:
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %esi, %edi, %eax
-; KNL-NEXT:    movzbl %r11b, %esi
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $30, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB22_30
-; KNL-NEXT:  ## BB#29:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB22_30:
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT:    movl $275, %eax ## imm = 0x113
-; KNL-NEXT:    bextrl %eax, %edi, %r11d
-; KNL-NEXT:    movzbl %dl, %edx
-; KNL-NEXT:    vmovd %esi, %xmm3
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $31, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_32
-; KNL-NEXT:  ## BB#31:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_32:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT:    movl $276, %eax ## imm = 0x114
-; KNL-NEXT:    bextrl %eax, %edi, %esi
-; KNL-NEXT:    movl $277, %r11d ## imm = 0x115
-; KNL-NEXT:    vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r10b, %r10d
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_34
-; KNL-NEXT:  ## BB#33:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_34:
-; KNL-NEXT:    vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r11d, %edi, %edx
-; KNL-NEXT:    movl $278, %r11d ## imm = 0x116
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r9b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shlq $63, %rcx
-; KNL-NEXT:    sarq $63, %rcx
-; KNL-NEXT:    vmovd %ecx, %xmm4
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $2, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_36
-; KNL-NEXT:  ## BB#35:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_36:
-; KNL-NEXT:    vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r11d, %edi, %edx
-; KNL-NEXT:    movl $279, %r9d ## imm = 0x117
-; KNL-NEXT:    vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %bl, %ebx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $3, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_38
-; KNL-NEXT:  ## BB#37:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_38:
-; KNL-NEXT:    vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r9d, %edi, %edx
-; KNL-NEXT:    movl $280, %esi ## imm = 0x118
-; KNL-NEXT:    vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r12b, %ebx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $4, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_40
-; KNL-NEXT:  ## BB#39:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_40:
-; KNL-NEXT:    vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %esi, %edi, %ecx
-; KNL-NEXT:    movl $281, %edx ## imm = 0x119
-; KNL-NEXT:    vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r14b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $5, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_42
-; KNL-NEXT:  ## BB#41:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_42:
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $282, %edx ## imm = 0x11A
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r8b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %bl
-; KNL-NEXT:    shrb $6, %bl
-; KNL-NEXT:    andb $1, %bl
-; KNL-NEXT:    je LBB22_44
-; KNL-NEXT:  ## BB#43:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB22_44:
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %eax
-; KNL-NEXT:    movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT:    vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r13b, %esi
-; KNL-NEXT:    movzbl %bl, %edx
-; KNL-NEXT:    vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %bl
-; KNL-NEXT:    shrb $7, %bl
-; KNL-NEXT:    je LBB22_46
-; KNL-NEXT:  ## BB#45:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB22_46:
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    movl $284, %edx ## imm = 0x11C
-; KNL-NEXT:    vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT:    movzbl %al, %esi
-; KNL-NEXT:    movzbl %bl, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $8, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_48
-; KNL-NEXT:  ## BB#47:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_48:
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $285, %edx ## imm = 0x11D
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT:    movzbl %sil, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $9, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_50
-; KNL-NEXT:  ## BB#49:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_50:
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $286, %edx ## imm = 0x11E
-; KNL-NEXT:    vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT:    movzbl %sil, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $10, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_52
-; KNL-NEXT:  ## BB#51:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_52:
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %edx
-; KNL-NEXT:    vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $11, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_54
-; KNL-NEXT:  ## BB#53:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_54:
-; KNL-NEXT:    vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT:    shrl $31, %edi
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $12, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_56
-; KNL-NEXT:  ## BB#55:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_56:
-; KNL-NEXT:    vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $13, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_58
-; KNL-NEXT:  ## BB#57:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_58:
-; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $14, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB22_60
-; KNL-NEXT:  ## BB#59:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB22_60:
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT:    shrq $15, %r15
-; KNL-NEXT:    andb $1, %r15b
-; KNL-NEXT:    je LBB22_62
-; KNL-NEXT:  ## BB#61:
-; KNL-NEXT:    movb $-1, %r15b
-; KNL-NEXT:  LBB22_62:
-; KNL-NEXT:    movzbl %r15b, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT:    leaq -40(%rbp), %rsp
-; KNL-NEXT:    popq %rbx
-; KNL-NEXT:    popq %r12
-; KNL-NEXT:    popq %r13
-; KNL-NEXT:    popq %r14
-; KNL-NEXT:    popq %r15
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test16:
 ; SKX:       ## BB#0:
@@ -872,446 +436,6 @@ define <64 x i8> @test16(i64 %x) {
 }
 
 define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
-; KNL-LABEL: test17:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Ltmp8:
-; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:  Ltmp9:
-; KNL-NEXT:    .cfi_offset %rbp, -16
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:  Ltmp10:
-; KNL-NEXT:    .cfi_def_cfa_register %rbp
-; KNL-NEXT:    pushq %r15
-; KNL-NEXT:    pushq %r14
-; KNL-NEXT:    pushq %r13
-; KNL-NEXT:    pushq %r12
-; KNL-NEXT:    pushq %rbx
-; KNL-NEXT:    andq $-32, %rsp
-; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:  Ltmp11:
-; KNL-NEXT:    .cfi_offset %rbx, -56
-; KNL-NEXT:  Ltmp12:
-; KNL-NEXT:    .cfi_offset %r12, -48
-; KNL-NEXT:  Ltmp13:
-; KNL-NEXT:    .cfi_offset %r13, -40
-; KNL-NEXT:  Ltmp14:
-; KNL-NEXT:    .cfi_offset %r14, -32
-; KNL-NEXT:  Ltmp15:
-; KNL-NEXT:    .cfi_offset %r15, -24
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    shrq $32, %rax
-; KNL-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    movl %edi, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    vmovd %eax, %xmm0
-; KNL-NEXT:    movl $257, %eax ## imm = 0x101
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $258, %eax ## imm = 0x102
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $259, %eax ## imm = 0x103
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $260, %eax ## imm = 0x104
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $261, %eax ## imm = 0x105
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $262, %eax ## imm = 0x106
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $263, %eax ## imm = 0x107
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $264, %eax ## imm = 0x108
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $265, %eax ## imm = 0x109
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $266, %eax ## imm = 0x10A
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $267, %eax ## imm = 0x10B
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $268, %eax ## imm = 0x10C
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $269, %eax ## imm = 0x10D
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $270, %eax ## imm = 0x10E
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT:    movl $271, %eax ## imm = 0x10F
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
-; KNL-NEXT:    cmpl %edx, %esi
-; KNL-NEXT:    setg %al
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT:    movq %r15, %rdx
-; KNL-NEXT:    shrq $17, %rdx
-; KNL-NEXT:    andb $1, %dl
-; KNL-NEXT:    je LBB23_2
-; KNL-NEXT:  ## BB#1:
-; KNL-NEXT:    movb $-1, %dl
-; KNL-NEXT:  LBB23_2:
-; KNL-NEXT:    movq %r15, %r11
-; KNL-NEXT:    shrq $16, %r11
-; KNL-NEXT:    andb $1, %r11b
-; KNL-NEXT:    je LBB23_4
-; KNL-NEXT:  ## BB#3:
-; KNL-NEXT:    movb $-1, %r11b
-; KNL-NEXT:  LBB23_4:
-; KNL-NEXT:    movq %r15, %r10
-; KNL-NEXT:    shrq $18, %r10
-; KNL-NEXT:    andb $1, %r10b
-; KNL-NEXT:    je LBB23_6
-; KNL-NEXT:  ## BB#5:
-; KNL-NEXT:    movb $-1, %r10b
-; KNL-NEXT:  LBB23_6:
-; KNL-NEXT:    movq %r15, %r9
-; KNL-NEXT:    shrq $19, %r9
-; KNL-NEXT:    andb $1, %r9b
-; KNL-NEXT:    je LBB23_8
-; KNL-NEXT:  ## BB#7:
-; KNL-NEXT:    movb $-1, %r9b
-; KNL-NEXT:  LBB23_8:
-; KNL-NEXT:    movq %r15, %rbx
-; KNL-NEXT:    shrq $20, %rbx
-; KNL-NEXT:    andb $1, %bl
-; KNL-NEXT:    je LBB23_10
-; KNL-NEXT:  ## BB#9:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB23_10:
-; KNL-NEXT:    movq %r15, %r12
-; KNL-NEXT:    shrq $21, %r12
-; KNL-NEXT:    andb $1, %r12b
-; KNL-NEXT:    je LBB23_12
-; KNL-NEXT:  ## BB#11:
-; KNL-NEXT:    movb $-1, %r12b
-; KNL-NEXT:  LBB23_12:
-; KNL-NEXT:    movq %r15, %r14
-; KNL-NEXT:    shrq $22, %r14
-; KNL-NEXT:    andb $1, %r14b
-; KNL-NEXT:    je LBB23_14
-; KNL-NEXT:  ## BB#13:
-; KNL-NEXT:    movb $-1, %r14b
-; KNL-NEXT:  LBB23_14:
-; KNL-NEXT:    movq %r15, %r8
-; KNL-NEXT:    shrq $23, %r8
-; KNL-NEXT:    andb $1, %r8b
-; KNL-NEXT:    je LBB23_16
-; KNL-NEXT:  ## BB#15:
-; KNL-NEXT:    movb $-1, %r8b
-; KNL-NEXT:  LBB23_16:
-; KNL-NEXT:    movq %r15, %r13
-; KNL-NEXT:    shrq $24, %r13
-; KNL-NEXT:    andb $1, %r13b
-; KNL-NEXT:    je LBB23_18
-; KNL-NEXT:  ## BB#17:
-; KNL-NEXT:    movb $-1, %r13b
-; KNL-NEXT:  LBB23_18:
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $25, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_20
-; KNL-NEXT:  ## BB#19:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_20:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $26, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_22
-; KNL-NEXT:  ## BB#21:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_22:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movl $272, %esi ## imm = 0x110
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $27, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_24
-; KNL-NEXT:  ## BB#23:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_24:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movl $273, %eax ## imm = 0x111
-; KNL-NEXT:    bextrl %esi, %edi, %esi
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $28, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB23_26
-; KNL-NEXT:  ## BB#25:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB23_26:
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    bextrl %eax, %edi, %eax
-; KNL-NEXT:    vmovd %esi, %xmm2
-; KNL-NEXT:    movl $274, %esi ## imm = 0x112
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $29, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB23_28
-; KNL-NEXT:  ## BB#27:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB23_28:
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %esi, %edi, %eax
-; KNL-NEXT:    movzbl %r11b, %esi
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shrq $30, %rcx
-; KNL-NEXT:    andb $1, %cl
-; KNL-NEXT:    je LBB23_30
-; KNL-NEXT:  ## BB#29:
-; KNL-NEXT:    movb $-1, %cl
-; KNL-NEXT:  LBB23_30:
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT:    movl $275, %eax ## imm = 0x113
-; KNL-NEXT:    bextrl %eax, %edi, %r11d
-; KNL-NEXT:    movzbl %dl, %edx
-; KNL-NEXT:    vmovd %esi, %xmm3
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $31, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_32
-; KNL-NEXT:  ## BB#31:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_32:
-; KNL-NEXT:    movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT:    movl $276, %eax ## imm = 0x114
-; KNL-NEXT:    bextrl %eax, %edi, %esi
-; KNL-NEXT:    movl $277, %r11d ## imm = 0x115
-; KNL-NEXT:    vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r10b, %r10d
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_34
-; KNL-NEXT:  ## BB#33:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_34:
-; KNL-NEXT:    vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r11d, %edi, %edx
-; KNL-NEXT:    movl $278, %r11d ## imm = 0x116
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r9b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    movq %r15, %rcx
-; KNL-NEXT:    shlq $63, %rcx
-; KNL-NEXT:    sarq $63, %rcx
-; KNL-NEXT:    vmovd %ecx, %xmm4
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $2, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_36
-; KNL-NEXT:  ## BB#35:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_36:
-; KNL-NEXT:    vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r11d, %edi, %edx
-; KNL-NEXT:    movl $279, %r9d ## imm = 0x117
-; KNL-NEXT:    vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %bl, %ebx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $3, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_38
-; KNL-NEXT:  ## BB#37:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_38:
-; KNL-NEXT:    vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %r9d, %edi, %edx
-; KNL-NEXT:    movl $280, %esi ## imm = 0x118
-; KNL-NEXT:    vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r12b, %ebx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $4, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_40
-; KNL-NEXT:  ## BB#39:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_40:
-; KNL-NEXT:    vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %esi, %edi, %ecx
-; KNL-NEXT:    movl $281, %edx ## imm = 0x119
-; KNL-NEXT:    vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r14b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %al
-; KNL-NEXT:    shrb $5, %al
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_42
-; KNL-NEXT:  ## BB#41:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_42:
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $282, %edx ## imm = 0x11A
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r8b, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %bl
-; KNL-NEXT:    shrb $6, %bl
-; KNL-NEXT:    andb $1, %bl
-; KNL-NEXT:    je LBB23_44
-; KNL-NEXT:  ## BB#43:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB23_44:
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %eax
-; KNL-NEXT:    movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT:    vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movzbl %r13b, %esi
-; KNL-NEXT:    movzbl %bl, %edx
-; KNL-NEXT:    vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT:    movb %r15b, %bl
-; KNL-NEXT:    shrb $7, %bl
-; KNL-NEXT:    je LBB23_46
-; KNL-NEXT:  ## BB#45:
-; KNL-NEXT:    movb $-1, %bl
-; KNL-NEXT:  LBB23_46:
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %ecx, %edi, %ecx
-; KNL-NEXT:    movl $284, %edx ## imm = 0x11C
-; KNL-NEXT:    vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT:    movzbl %al, %esi
-; KNL-NEXT:    movzbl %bl, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $8, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_48
-; KNL-NEXT:  ## BB#47:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_48:
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $285, %edx ## imm = 0x11D
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT:    movzbl %sil, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $9, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_50
-; KNL-NEXT:  ## BB#49:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_50:
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %ecx
-; KNL-NEXT:    movl $286, %edx ## imm = 0x11E
-; KNL-NEXT:    vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT:    movzbl %sil, %esi
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $10, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_52
-; KNL-NEXT:  ## BB#51:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_52:
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    bextrl %edx, %edi, %edx
-; KNL-NEXT:    vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $11, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_54
-; KNL-NEXT:  ## BB#53:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_54:
-; KNL-NEXT:    vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT:    shrl $31, %edi
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $12, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_56
-; KNL-NEXT:  ## BB#55:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_56:
-; KNL-NEXT:    vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $13, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_58
-; KNL-NEXT:  ## BB#57:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_58:
-; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT:    movzbl %cl, %ecx
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT:    movq %r15, %rax
-; KNL-NEXT:    shrq $14, %rax
-; KNL-NEXT:    andb $1, %al
-; KNL-NEXT:    je LBB23_60
-; KNL-NEXT:  ## BB#59:
-; KNL-NEXT:    movb $-1, %al
-; KNL-NEXT:  LBB23_60:
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT:    shrq $15, %r15
-; KNL-NEXT:    andb $1, %r15b
-; KNL-NEXT:    je LBB23_62
-; KNL-NEXT:  ## BB#61:
-; KNL-NEXT:    movb $-1, %r15b
-; KNL-NEXT:  LBB23_62:
-; KNL-NEXT:    movzbl %r15b, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT:    leaq -40(%rbp), %rsp
-; KNL-NEXT:    popq %rbx
-; KNL-NEXT:    popq %r12
-; KNL-NEXT:    popq %r13
-; KNL-NEXT:    popq %r14
-; KNL-NEXT:    popq %r15
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test17:
 ; SKX:       ## BB#0:
@@ -1790,3 +914,127 @@ L2:
 End:
   ret void
 }
+
+define <8 x i64> @load_8i1(<8 x i1>* %a) {
+; KNL-LABEL: load_8i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movzbw (%rdi), %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2q %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <8 x i1>, <8 x i1>* %a
+  %c = sext <8 x i1> %b to <8 x i64>
+  ret <8 x i64> %c
+}
+
+define <16 x i32> @load_16i1(<16 x i1>* %a) {
+; KNL-LABEL: load_16i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw (%rdi), %k1
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw (%rdi), %k0
+; SKX-NEXT:    vpmovm2d %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <16 x i1>, <16 x i1>* %a
+  %c = sext <16 x i1> %b to <16 x i32>
+  ret <16 x i32> %c
+}
+
+define <2 x i16> @load_2i1(<2 x i1>* %a) {
+; KNL-LABEL: load_2i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movb (%rdi), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_2i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2q %k0, %xmm0
+; SKX-NEXT:    retq
+  %b = load <2 x i1>, <2 x i1>* %a
+  %c = sext <2 x i1> %b to <2 x i16>
+  ret <2 x i16> %c
+}
+
+define <4 x i16> @load_4i1(<4 x i1>* %a) {
+; KNL-LABEL: load_4i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movb (%rdi), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_4i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    retq
+  %b = load <4 x i1>, <4 x i1>* %a
+  %c = sext <4 x i1> %b to <4 x i16>
+  ret <4 x i16> %c
+}
+
+define <32 x i16> @load_32i1(<32 x i1>* %a) {
+; KNL-LABEL: load_32i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw (%rdi), %k1
+; KNL-NEXT:    movl {{.*}}(%rip), %eax
+; KNL-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; KNL-NEXT:    kmovw 2(%rdi), %k1
+; KNL-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpmovdw %zmm1, %ymm1
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd (%rdi), %k0
+; SKX-NEXT:    vpmovm2w %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <32 x i1>, <32 x i1>* %a
+  %c = sext <32 x i1> %b to <32 x i16>
+  ret <32 x i16> %c
+}
+
+define <64 x i8> @load_64i1(<64 x i1>* %a) {
+; KNL-LABEL: load_64i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw (%rdi), %k1
+; KNL-NEXT:    movl {{.*}}(%rip), %eax
+; KNL-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    kmovw 2(%rdi), %k1
+; KNL-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    kmovw 4(%rdi), %k1
+; KNL-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    kmovw 6(%rdi), %k1
+; KNL-NEXT:    vpbroadcastd %eax, %zmm2 {%k1} {z}
+; KNL-NEXT:    vpmovdb %zmm2, %xmm2
+; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: load_64i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovq (%rdi), %k0
+; SKX-NEXT:    vpmovm2b %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <64 x i1>, <64 x i1>* %a
+  %c = sext <64 x i1> %b to <64 x i8>
+  ret <64 x i8> %c
+}

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Sun Apr  3 03:41:12 2016
@@ -291,7 +291,8 @@ define <8 x i32> @test7(i32* %base, <8 x
 ; KNL_32-LABEL: test7:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_32-NEXT:    kmovw %ecx, %k1
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT:    kmovw %k1, %k2
 ; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=265259&r1=265258&r2=265259&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Sun Apr  3 03:41:12 2016
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
@@ -399,34 +400,17 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0
 }
 
 define i64 @shuf64i1_zero(i64 %a) {
-; AVX512F-LABEL: shuf64i1_zero:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:  .Ltmp0:
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:  .Ltmp1:
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:  .Ltmp2:
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
-; AVX512F-NEXT:    movb $0, (%rsp)
-; AVX512F-NEXT:    movl (%rsp), %ecx
-; AVX512F-NEXT:    movq %rcx, %rax
-; AVX512F-NEXT:    shlq $32, %rax
-; AVX512F-NEXT:    orq %rcx, %rax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
-; AVX512F-NEXT:    retq
-;
 ; VL_BW_DQ-LABEL: shuf64i1_zero:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kxorq %k0, %k0, %k0
+; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
+; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
+; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
+; VL_BW_DQ-NEXT:    vpsllw $7, %zmm0, %zmm0
+; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
 ; VL_BW_DQ-NEXT:    kmovq %k0, %rax
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i64 %a to <64 x i1>
-  %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
+  %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
   %d = bitcast <64 x i1> %c to i64
   ret i64 %d
 }



