[llvm] r279782 - Revert r274613 because it breaks the test suite with AVX512

Michael Kuperstein via llvm-commits <llvm-commits at lists.llvm.org>
Thu Aug 25 14:55:41 PDT 2016


Author: mkuper
Date: Thu Aug 25 16:55:41 2016
New Revision: 279782

URL: http://llvm.org/viewvc/llvm-project?rev=279782&view=rev
Log:
Revert r274613 because it breaks the test suite with AVX512

This reverts most of r274613 and its follow-ups (r276347 and r277289) due to
miscompiles in the test suite. The FastISel change was left in because it
apparently fixes an unrelated issue.

This fixes 4 out of the 5 test failures in PR29112.
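
For context on what is being reverted: the hunks below show that the reverted
change modeled scalar i1 values as already zero-extended when moving between
general-purpose registers and k-registers, which let it drop the explicit
"and $1" masking. This revert puts the masking back throughout. A reduced
sketch of the kind of IR affected (illustrative only; the function is made up,
and the modified tests below contain the real reproducers):

define zeroext i1 @roundtrip_i1(i32 %a, i32 %b) {
  %c = icmp ult i32 %a, %b                                    ; setb %al
  %v = insertelement <16 x i1> zeroinitializer, i1 %c, i32 0  ; andl $1, then kmovw into a k-register
  %e = extractelement <16 x i1> %v, i32 0                     ; kmovw back out of the k-register
  ret i1 %e                                                   ; masked again with andl $1
}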

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512-ext.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/fast-isel-select-cmov.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/masked_memop.ll
    llvm/trunk/test/CodeGen/X86/pr27591.ll
    llvm/trunk/test/CodeGen/X86/pr28173.ll
    llvm/trunk/test/CodeGen/X86/xaluo.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Aug 25 16:55:41 2016
@@ -15998,7 +15998,7 @@ SDValue X86TargetLowering::LowerSETCCE(S
   SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                               DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
   if (Op.getSimpleValueType() == MVT::i1)
-    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   return SetCC;
 }
 
@@ -16028,18 +16028,14 @@ static bool isX86LogicalCmp(SDValue Op)
   return false;
 }
 
-/// Returns the "condition" node, that may be wrapped with "truncate".
-/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
-static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
-    return V;
+    return false;
 
   SDValue VOp0 = V.getOperand(0);
   unsigned InBits = VOp0.getValueSizeInBits();
   unsigned Bits = V.getValueSizeInBits();
-  if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
-    return V.getOperand(0);
-  return V;
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
 }
 
 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -16262,7 +16258,8 @@ SDValue X86TargetLowering::LowerSELECT(S
 
   if (addTest) {
     // Look past the truncate if the high bits are known zero.
-    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+      Cond = Cond.getOperand(0);
 
     // We know the result of AND is compared against zero. Try to match
     // it to BT.
@@ -17098,7 +17095,8 @@ SDValue X86TargetLowering::LowerBRCOND(S
 
   if (addTest) {
     // Look pass the truncate if the high bits are known zero.
-    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
 
     // We know the result is compared against zero. Try to match it to BT.
     if (Cond.hasOneUse()) {
@@ -18349,7 +18347,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
-    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
@@ -20891,12 +20889,10 @@ static SDValue LowerXALUO(SDValue Op, Se
     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
 
     SDValue SetCC =
-      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+      DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
                   SDValue(Sum.getNode(), 2));
 
-    if (N->getValueType(1) == MVT::i1)
-      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   }
   }
@@ -20906,12 +20902,15 @@ static SDValue LowerXALUO(SDValue Op, Se
   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
 
   SDValue SetCC =
-    DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                 DAG.getConstant(Cond, DL, MVT::i32),
                 SDValue(Sum.getNode(), 1));
 
-  if (N->getValueType(1) == MVT::i1)
+  if (N->getValueType(1) == MVT::i1) {
+    SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+                        DAG.getValueType(MVT::i1));
     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+  }
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }
 
@@ -30968,12 +30967,18 @@ static SDValue combineGatherScatter(SDNo
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
-                               SelectionDAG &DAG) {
-  return DAG.getNode(ISD::AND, DL, MVT::i8,
+                               SelectionDAG &DAG, MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                   EFLAGS),
+                       DAG.getConstant(1, DL, VT));
+  assert (VT == MVT::i1 && "Unexpected type for SECCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
-                                 EFLAGS),
-                     DAG.getConstant(1, DL, MVT::i8));
+                                 EFLAGS));
 }
 
 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
@@ -30998,7 +31003,7 @@ static SDValue combineX86SetCC(SDNode *N
                                    EFLAGS.getNode()->getVTList(),
                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
-      return MaterializeSETB(DL, NewEFLAGS, DAG);
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
     }
   }
 
@@ -31006,7 +31011,7 @@ static SDValue combineX86SetCC(SDNode *N
   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   // cases.
   if (CC == X86::COND_B)
-    return MaterializeSETB(DL, EFLAGS, DAG);
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
 
   // Try to simplify the EFLAGS and condition code operands.
   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
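
A note on the LowerXALUO hunks above: they change how the overflow bit of the
arithmetic-with-overflow intrinsics is materialized when it is consumed as an
i1, restoring an AssertZext(i1) wrapper ahead of the final truncate. A minimal
function that reaches this path (illustrative only, not taken from the commit;
xaluo.ll below carries the real coverage):

declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)

define i1 @overflowed(i32 %a, i32 %b) {
  %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  %o = extractvalue { i32, i1 } %s, 1   ; the seto result, re-masked when used as i1
  ret i1 %o
}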

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Thu Aug 25 16:55:41 2016
@@ -2170,91 +2170,51 @@ let Predicates = [HasBWI] in {
             (KMOVQkm addr:$src)>;
 }
 
-def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{
-  return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
-}]>;
-
-def trunc_setcc : PatFrag<(ops node:$src), (trunc node:$src), [{
-  return (N->getOperand(0)->getOpcode() == X86ISD::SETCC);
-}]>;
-
-def trunc_mask_1 : PatFrag<(ops node:$src), (trunc node:$src), [{
-  return (N->getOperand(0)->getOpcode() == ISD::AND &&
-          isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)) &&
-          N->getOperand(0)->getConstantOperandVal(1) == 1);
-}]>;
-
-
 let Predicates = [HasAVX512] in {
   def : Pat<(i1 (trunc (i64 GR64:$src))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)),
-                                    sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc_mask_1 GR64:$src)),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+                                        (i32 1))), VK1)>;
 
   def : Pat<(i1 (trunc (i32 GR32:$src))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)),
-                                    sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc_mask_1 GR32:$src)),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
 
   def : Pat<(i1 (trunc (i8 GR8:$src))),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri $src, (i8 1)),
-                                    sub_8bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
-
-  def : Pat<(i1 (trunc_setcc GR8:$src)),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
-
-  def : Pat<(i1 (trunc_mask_1 GR8:$src)),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
-
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
+       VK1)>;
   def : Pat<(i1 (trunc (i16 GR16:$src))),
-            (COPY_TO_REGCLASS (AND16ri GR16:$src, (i16 1)), VK1)>;
-
-  def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))),
-            (COPY_TO_REGCLASS $src, VK1)>;
-
-  def : Pat<(i1 (trunc_mask_1 GR16:$src)),
-            (COPY_TO_REGCLASS $src, VK1)>;
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
+       VK1)>;
 
   def : Pat<(i32 (zext VK1:$src)),
-            (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
-
+            (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
   def : Pat<(i32 (anyext VK1:$src)),
-            (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
+            (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
 
   def : Pat<(i8 (zext VK1:$src)),
-            (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>;
-
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk
+                        (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
   def : Pat<(i8 (anyext VK1:$src)),
-            (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>;
+              (EXTRACT_SUBREG
+                (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
 
   def : Pat<(i64 (zext VK1:$src)),
-            (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
-
+            (AND64ri8 (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
   def : Pat<(i64 (anyext VK1:$src)),
-            (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
+            (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit)>;
 
   def : Pat<(i16 (zext VK1:$src)),
-            (COPY_TO_REGCLASS $src, GR16)>;
-
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+              sub_16bit)>;
   def : Pat<(i16 (anyext VK1:$src)),
-            (i16 (COPY_TO_REGCLASS $src, GR16))>;
+            (EXTRACT_SUBREG
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+              sub_16bit)>;
 }
 def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
           (COPY_TO_REGCLASS VK1:$src, VK16)>;
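
The restored patterns above route GPR-to-VK1 and VK1-to-GPR crossings through
an explicit AND with 1 (AND32ri8/AND64ri8 around KMOVWkr/KMOVWrk), rather than
matching inputs that are provably masked already via the removed
assertzext_i1, trunc_setcc and trunc_mask_1 PatFrags. The zext_test* functions
in avx512-mask-op.ll below exercise exactly this shape; a reduced form
(illustrative, mirroring those tests):

define i32 @zext_mask_bit(<16 x i32> %a, <16 x i32> %b) {
  %cmp = icmp ugt <16 x i32> %a, %b            ; compare into a k-register
  %bit = extractelement <16 x i1> %cmp, i32 5  ; kshiftlw $10 + kshiftrw $15
  %r = zext i1 %bit to i32                     ; kmovw %k0, %eax + andl $1, %eax
  ret i32 %r
}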

Modified: llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cmp.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cmp.ll Thu Aug 25 16:55:41 2016
@@ -163,10 +163,12 @@ define i32 @test10(i64 %b, i64 %c, i1 %d
 ; ALL-NEXT:    kmovw %edx, %k0
 ; ALL-NEXT:    cmpq %rsi, %rdi
 ; ALL-NEXT:    sete %al
+; ALL-NEXT:    andl $1, %eax
 ; ALL-NEXT:    kmovw %eax, %k1
 ; ALL-NEXT:    korw %k1, %k0, %k1
 ; ALL-NEXT:    kxorw %k1, %k0, %k0
 ; ALL-NEXT:    kmovw %k0, %eax
+; ALL-NEXT:    andl $1, %eax
 ; ALL-NEXT:    testb %al, %al
 ; ALL-NEXT:    je LBB8_1
 ; ALL-NEXT:  ## BB#2: ## %if.end.i

Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Thu Aug 25 16:55:41 2016
@@ -1525,260 +1525,260 @@ define <64 x i16> @test21(<64 x i16> %x
 ; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edi
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    kmovw %k1, %r12d
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r15d, %xmm4
-; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    vmovd %r13d, %xmm4
+; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    vpinsrb $3, %r11d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r11d
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $4, %r8d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
-; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    vpinsrb $5, %edi, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %esi
-; KNL-NEXT:    movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT:    vpinsrb $6, %edx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $7, %esi, %xmm4, %xmm4
 ; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    vpinsrb $8, %ebx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ebx
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    vpinsrb $9, %ebp, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ebp
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %r10d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    vpinsrb $10, %r14d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r14d
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %r11d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $11, %r15d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $12, %ebx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    vpinsrb $12, %r12d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edi
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %ebp, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    vpinsrb $13, %r10d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    vpinsrb $14, %r9d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r9d
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    vpinsrb $15, %r13d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r12d
 ; KNL-NEXT:    vptestmd %zmm6, %zmm6, %k0
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %eax, %xmm5
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    vmovd %ecx, %xmm5
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $2, %r11d, %xmm5, %xmm5
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    vpinsrb $3, %r8d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    vpinsrb $5, %edx, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    vpinsrb $6, %esi, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %esi
-; KNL-NEXT:    movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $7, %ebx, %xmm5, %xmm5
 ; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %edi
+; KNL-NEXT:    vpinsrb $8, %ebp, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    vpinsrb $9, %r14d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $10, %r10d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    vpinsrb $10, %r15d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    vpinsrb $11, %edi, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $12, %ebp, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    vpinsrb $12, %r10d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $13, %r11d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    vpinsrb $13, %r9d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    vpinsrb $14, %r12d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    vpinsrb $15, %r13d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    vptestmd %zmm7, %zmm7, %k1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vmovd %eax, %xmm6
-; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    kmovw %k0, %r12d
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    vpinsrb $4, %r8d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $5, %edx, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vpinsrb $6, %esi, %xmm6, %xmm6
 ; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    vpinsrb $7, %ebp, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ebp
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    vpinsrb $8, %ebx, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ebx
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    vpinsrb $9, %r11d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r11d
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %ebx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    vpinsrb $10, %edi, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %edi
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %ebp, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    vpinsrb $11, %r10d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $12, %r10d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $12, %r9d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r9d
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %r11d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    vpinsrb $13, %r14d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r14d
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    vpinsrb $14, %r15d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k0, %r15d
-; KNL-NEXT:    kshiftrw $15, %k1, %k0
-; KNL-NEXT:    vmovd %r12d, %xmm7
+; KNL-NEXT:    vpinsrb $15, %r12d, %xmm6, %xmm6
 ; KNL-NEXT:    kmovw %k0, %r12d
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $2, %edx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $3, %r13d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm7, %xmm7
+; KNL-NEXT:    kshiftrw $15, %k1, %k0
+; KNL-NEXT:    vmovd %eax, %xmm7
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $1, %r13d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $3, %r8d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $4, %edx, %xmm7, %xmm7
 ; KNL-NEXT:    vpinsrb $5, %esi, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $6, %edi, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $7, %r8d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $8, %r9d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $9, %ebx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $11, %r10d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $12, %r11d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $13, %r14d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $6, %ebp, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $7, %ebx, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $8, %r11d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $9, %edi, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $10, %r10d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $11, %r9d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $12, %r14d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $13, %r15d, %xmm7, %xmm7
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4
@@ -1791,8 +1791,8 @@ define <64 x i16> @test21(<64 x i16> %x
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT:    vpinsrb $14, %r15d, %xmm7, %xmm4
-; KNL-NEXT:    vpinsrb $15, %r12d, %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $14, %r12d, %xmm7, %xmm4
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Thu Aug 25 16:55:41 2016
@@ -201,6 +201,7 @@ define <16 x i32> @test11(<16 x i32>%a,
 ; KNL-NEXT:    kshiftlw $11, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    je LBB10_2
 ; KNL-NEXT:  ## BB#1: ## %A
@@ -216,6 +217,7 @@ define <16 x i32> @test11(<16 x i32>%a,
 ; SKX-NEXT:    kshiftlw $11, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    je LBB10_2
 ; SKX-NEXT:  ## BB#1: ## %A
@@ -243,6 +245,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    cmoveq %rsi, %rdi
 ; KNL-NEXT:    movq %rdi, %rax
@@ -256,6 +259,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    cmoveq %rsi, %rdi
 ; SKX-NEXT:    movq %rdi, %rax
@@ -271,6 +275,7 @@ define i16 @test13(i32 %a, i32 %b) {
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    cmpl %esi, %edi
 ; KNL-NEXT:    setb %al
+; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    kmovw %eax, %k0
 ; KNL-NEXT:    movw $-4, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -284,6 +289,7 @@ define i16 @test13(i32 %a, i32 %b) {
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k0
 ; SKX-NEXT:    movw $-4, %ax
 ; SKX-NEXT:    kmovw %eax, %k1
@@ -305,6 +311,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64
 ; KNL-NEXT:    kshiftlw $11, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    cmoveq %rsi, %rdi
 ; KNL-NEXT:    movq %rdi, %rax
@@ -316,6 +323,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64
 ; SKX-NEXT:    kshiftlb $3, %k0, %k0
 ; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    cmoveq %rsi, %rdi
 ; SKX-NEXT:    movq %rdi, %rax
@@ -1033,10 +1041,175 @@ define <32 x i8> @test_insert_128_v32i8(
 }
 
 define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
+; KNL-LABEL: test_insertelement_v32i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Ltmp0:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Ltmp1:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Ltmp2:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $32, %rsp
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $13, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $12, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $11, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $10, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $9, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $8, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $6, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $5, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $4, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm0
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $13, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $12, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $11, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $10, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $9, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $8, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $6, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $5, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $4, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    sbbl %eax, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    movl (%rsp), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: test_insertelement_v32i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k0
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k1
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k2
@@ -1056,10 +1229,52 @@ define i32 @test_insertelement_v32i1(i32
 }
 
 define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
+; KNL-LABEL: test_iinsertelement_v4i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpextrd $1, %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2} {z}
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7]
+; KNL-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; KNL-NEXT:    vpsllq $63, %zmm3, %zmm2
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k2
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1} {z}
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7]
+; KNL-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
+; KNL-NEXT:    vpsllq $63, %zmm2, %zmm2
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1} {z}
+; KNL-NEXT:    vpextrd $3, %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7]
+; KNL-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: test_iinsertelement_v4i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k0
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k1
 ; SKX-NEXT:    vpmovm2d %k1, %xmm0
@@ -1078,10 +1293,34 @@ define i8 @test_iinsertelement_v4i1(i32
 }
 
 define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
+; KNL-LABEL: test_iinsertelement_v2i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
+; KNL-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; KNL-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; KNL-NEXT:    vpsllq $63, %zmm1, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: test_iinsertelement_v2i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k0
 ; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
 ; SKX-NEXT:    kshiftlw $1, %k1, %k1
@@ -1118,6 +1357,7 @@ define zeroext i8 @test_extractelement_v
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    addb $3, %al
@@ -1146,6 +1386,7 @@ define zeroext i8 @test_extractelement_v
 ; SKX-NEXT:    kshiftlw $12, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <4 x i32> %a, %b
   %t2 = extractelement <4 x i1> %t1, i32 3
@@ -1170,6 +1411,7 @@ define zeroext i8 @test_extractelement_v
 ; SKX-NEXT:    kshiftld $29, %k0, %k0
 ; SKX-NEXT:    kshiftrd $31, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <32 x i8> %a, %b
   %t2 = extractelement <32 x i1> %t1, i32 2
@@ -1197,6 +1439,7 @@ define zeroext i8 @test_extractelement_v
 ; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
 ; SKX-NEXT:    kshiftrq $63, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    addb $3, %al

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Thu Aug 25 16:55:41 2016
@@ -7,9 +7,11 @@ define i32 @test_kortestz(i16 %a0, i16 %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k0
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    kortestw %k0, %k1
 ; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
   ret i32 %res
@@ -4765,7 +4767,8 @@ define i8@test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
 
   %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
@@ -4786,7 +4789,8 @@ define i8@test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kandw %k2, %k1, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
 
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -4809,7 +4813,8 @@ define i8@test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
 
   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
@@ -4827,11 +4832,12 @@ define i8@test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k2 {%k1}
 ; CHECK-NEXT:    kmovw %k2, %ecx
 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    kmovw %k0, %edx
+; CHECK-NEXT:    kmovw %k1, %edx
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    andb %dl, %al
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Thu Aug 25 16:55:41 2016
@@ -173,6 +173,7 @@ define i32 @zext_test1(<16 x i32> %a, <1
 ; CHECK-NEXT:    kshiftlw $10, %k0, %k0
 ; CHECK-NEXT:    kshiftrw $15, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -187,6 +188,8 @@ define i16 @zext_test2(<16 x i32> %a, <1
 ; CHECK-NEXT:    kshiftlw $10, %k0, %k0
 ; CHECK-NEXT:    kshiftrw $15, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -201,7 +204,8 @@ define i8 @zext_test3(<16 x i32> %a, <16
 ; CHECK-NEXT:    kshiftlw $10, %k0, %k0
 ; CHECK-NEXT:    kshiftrw $15, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -606,6 +610,7 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k1
 ; SKX-NEXT:    vpmovm2b %k1, %zmm0
 ; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
@@ -1760,7 +1765,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
@@ -1775,7 +1780,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vmovd %r10d, %xmm2
@@ -1789,12 +1794,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
@@ -1828,7 +1833,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
@@ -1843,7 +1848,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vmovd %r10d, %xmm1
@@ -1857,12 +1862,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0

Modified: llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll Thu Aug 25 16:55:41 2016
@@ -491,10 +491,12 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vfpclasssd $2, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    vfpclasssd $4, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    addb %cl, %al
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
@@ -511,10 +513,12 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vfpclassss $4, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    vfpclassss $4, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    addb %cl, %al
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)

Modified: llvm/trunk/test/CodeGen/X86/fast-isel-select-cmov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fast-isel-select-cmov.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-select-cmov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fast-isel-select-cmov.ll Thu Aug 25 16:55:41 2016
@@ -15,6 +15,7 @@ define zeroext i16 @select_cmov_i16(i1 z
 ;
 ; AVX512-LABEL: select_cmov_i16:
 ; AVX512:       ## BB#0:
+; AVX512-NEXT:    andl $1, %edi
 ; AVX512-NEXT:    kmovw %edi, %k0
 ; AVX512-NEXT:    kortestw %k0, %k0
 ; AVX512-NEXT:    cmovew %dx, %si
@@ -46,6 +47,7 @@ define i32 @select_cmov_i32(i1 zeroext %
 ;
 ; AVX512-LABEL: select_cmov_i32:
 ; AVX512:       ## BB#0:
+; AVX512-NEXT:    andl $1, %edi
 ; AVX512-NEXT:    kmovw %edi, %k0
 ; AVX512-NEXT:    kortestw %k0, %k0
 ; AVX512-NEXT:    cmovel %edx, %esi
@@ -77,6 +79,7 @@ define i64 @select_cmov_i64(i1 zeroext %
 ;
 ; AVX512-LABEL: select_cmov_i64:
 ; AVX512:       ## BB#0:
+; AVX512-NEXT:    andl $1, %edi
 ; AVX512-NEXT:    kmovw %edi, %k0
 ; AVX512-NEXT:    kortestw %k0, %k0
 ; AVX512-NEXT:    cmoveq %rdx, %rsi
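
The three hunks above follow one pattern: a zeroext i1 argument used as a
select condition now gets an extra "andl $1, %edi" before the kmovw/kortestw
sequence that feeds the cmov. Reduced shape of the functions under test
(illustrative; the file checks i16, i32 and i64 variants):

define i32 @select_cmov(i1 zeroext %cond, i32 %a, i32 %b) {
  %r = select i1 %cond, i32 %a, i32 %b   ; kmovw %edi, %k0; kortestw %k0, %k0; cmovel
  ret i32 %r
}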

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Thu Aug 25 16:55:41 2016
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
 ; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
@@ -38,6 +39,14 @@ define <16 x float> @test1(float* %base,
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test1:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -87,6 +96,14 @@ define <16 x float> @test2(float* %base,
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test2:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -120,6 +137,14 @@ define <16 x i32> @test3(i32* %base, <16
 ; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test3:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -163,6 +188,17 @@ define <16 x i32> @test4(i32* %base, <16
 ; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
 ; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test4:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT:    kmovw %k1, %k2
+; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
+; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm2
+; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
+; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -215,6 +251,15 @@ define void @test5(i32* %base, <16 x i32
 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test5:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT:    kmovw %k1, %k2
+; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
+; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
@@ -267,6 +312,15 @@ define <8 x i32> @test6(<8 x i32>%a1, <8
 ; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
 ; SKX-NEXT:    vmovdqa64 %ymm2, %ymm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test6:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
+; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm2 {%k2}
+; SKX_32-NEXT:    vpscatterdd %ymm0, (,%ymm1) {%k1}
+; SKX_32-NEXT:    vmovdqa64 %ymm2, %ymm0
+; SKX_32-NEXT:    retl
 
   %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
 
@@ -309,6 +363,17 @@ define <8 x i32> @test7(i32* %base, <8 x
 ; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
 ; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test7:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT:    kmovw %k1, %k2
+; SKX_32-NEXT:    vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
+; SKX_32-NEXT:    vmovdqa64 %ymm1, %ymm2
+; SKX_32-NEXT:    vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
+; SKX_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
   %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
@@ -415,13 +480,13 @@ define <8 x i32> @test9(%struct.ST* %bas
 ; KNL_32-LABEL: test9:
 ; KNL_32:       # BB#0: # %entry
 ; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; KNL_32-NEXT:    vpbroadcastd .LCPI8_0, %ymm3
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm3
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd .LCPI8_1, %ymm3
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm3
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd .LCPI8_2, %ymm1
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
@@ -441,6 +506,18 @@ define <8 x i32> @test9(%struct.ST* %bas
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test9:
+; SKX_32:       # BB#0: # %entry
+; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
+; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
+; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
+; SKX_32-NEXT:    retl
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -477,13 +554,13 @@ define <8 x i32> @test10(%struct.ST* %ba
 ; KNL_32-LABEL: test10:
 ; KNL_32:       # BB#0: # %entry
 ; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; KNL_32-NEXT:    vpbroadcastd .LCPI9_0, %ymm3
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm3
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd .LCPI9_1, %ymm3
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm3
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd .LCPI9_2, %ymm1
+; KNL_32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
@@ -503,6 +580,18 @@ define <8 x i32> @test10(%struct.ST* %ba
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test10:
+; SKX_32:       # BB#0: # %entry
+; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
+; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
+; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
+; SKX_32-NEXT:    retl
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
@@ -535,6 +624,14 @@ define <16 x float> @test11(float* %base
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test11:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm1
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -568,6 +665,14 @@ define <16 x float> @test12(float* %base
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test12:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -596,6 +701,13 @@ define <16 x float> @test13(float* %base
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test13:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
@@ -675,25 +787,29 @@ define <4 x float> @test15(float* %base,
 ;
 ; KNL_64-LABEL: test15:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxor %ymm2, %ymm2, %ymm2
+; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
 ; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm0
 ; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
-; KNL_64-NEXT:    # kill
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test15:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxor %ymm2, %ymm2, %ymm2
+; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
 ; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm0
 ; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
-; KNL_32-NEXT:    # kill
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test15:
@@ -724,7 +840,9 @@ define <4 x double> @test16(double* %bas
 ;
 ; KNL_64-LABEL: test16:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
@@ -738,14 +856,16 @@ define <4 x double> @test16(double* %bas
 ;
 ; KNL_32-LABEL: test16:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    vpsllvq .LCPI15_0, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
@@ -778,7 +898,9 @@ define <2 x double> @test17(double* %bas
 ;
 ; KNL_64-LABEL: test17:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
@@ -788,10 +910,12 @@ define <2 x double> @test17(double* %bas
 ;
 ; KNL_32-LABEL: test17:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllvq .LCPI16_0, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
@@ -830,7 +954,10 @@ define void @test18(<4 x i32>%a1, <4 x i
 ;
 ; KNL_64-LABEL: test18:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxor %ymm3, %ymm3, %ymm3
+; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_64-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -839,7 +966,10 @@ define void @test18(<4 x i32>%a1, <4 x i
 ;
 ; KNL_32-LABEL: test18:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxor %ymm3, %ymm3, %ymm3
+; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
 ; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
@@ -868,7 +998,9 @@ define void @test19(<4 x double>%a1, dou
 ;
 ; KNL_64-LABEL: test19:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
@@ -880,13 +1012,15 @@ define void @test19(<4 x double>%a1, dou
 ;
 ; KNL_32-LABEL: test19:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllvq .LCPI18_0, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
 ; KNL_32-NEXT:    retl
@@ -915,7 +1049,9 @@ define void @test20(<2 x float>%a1, <2 x
 ;
 ; KNL_64-LABEL: test20:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; KNL_64-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
 ; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
@@ -926,7 +1062,8 @@ define void @test20(<2 x float>%a1, <2 x
 ;
 ; KNL_32-LABEL: test20:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; KNL_32-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
 ; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
@@ -939,7 +1076,8 @@ define void @test20(<2 x float>%a1, <2 x
 ;
 ; SKX-LABEL: test20:
 ; SKX:       # BB#0:
-; SKX:         vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
@@ -964,7 +1102,8 @@ define void @test21(<2 x i32>%a1, <2 x i
 ;
 ; KNL_64-LABEL: test21:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    vpsllq $63, %zmm2, %zmm2
@@ -974,17 +1113,19 @@ define void @test21(<2 x i32>%a1, <2 x i
 ;
 ; KNL_32-LABEL: test21:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT:    vpsllvq .LCPI20_0, %zmm2, %zmm2
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test21:
 ; SKX:       # BB#0:
-; SKX:         vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
@@ -994,7 +1135,8 @@ define void @test21(<2 x i32>%a1, <2 x i
 ;
 ; SKX_32-LABEL: test21:
 ; SKX_32:       # BB#0:
-; SKX_32:         vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; SKX_32-NEXT:    vptestmq %xmm2, %xmm2, %k0
 ; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
@@ -1013,7 +1155,8 @@ define <2 x float> @test22(float* %base,
 ;
 ; KNL_64-LABEL: test22:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; KNL_64-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
 ; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
@@ -1027,7 +1170,8 @@ define <2 x float> @test22(float* %base,
 ;
 ; KNL_32-LABEL: test22:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; KNL_32-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
 ; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
@@ -1075,7 +1219,9 @@ define <2 x i32> @test23(i32* %base, <2
 ;
 ; KNL_64-LABEL: test23:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
@@ -1085,10 +1231,12 @@ define <2 x i32> @test23(i32* %base, <2
 ;
 ; KNL_32-LABEL: test23:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllvq .LCPI22_0, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovdqa64 %zmm2, %zmm0
@@ -1119,7 +1267,8 @@ define <2 x i32> @test23(i32* %base, <2
 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_64-LABEL: test24:
 ; KNL_64:       # BB#0:
-; KNL_64:         movb $3, %al
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -1127,10 +1276,11 @@ define <2 x i32> @test24(i32* %base, <2
 ;
 ; KNL_32-LABEL: test24:
 ; KNL_32:       # BB#0:
-; KNL_32:         movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; KNL_32-NEXT:    vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
-; KNL_32-NEXT:    vpsllvq .LCPI23_1, %zmm1, %zmm1
+; KNL_32-NEXT:    vinserti32x4 $0, {{\.LCPI.*}}, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
 ; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -1160,7 +1310,9 @@ define <2 x i64> @test25(i64* %base, <2
 ;
 ; KNL_64-LABEL: test25:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_64-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
@@ -1170,10 +1322,12 @@ define <2 x i64> @test25(i64* %base, <2
 ;
 ; KNL_32-LABEL: test25:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
+; KNL_32-NEXT:    # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllvq .LCPI24_0, %zmm1, %zmm1
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovdqa64 %zmm2, %zmm0
@@ -1205,7 +1359,9 @@ define <2 x i64> @test26(i64* %base, <2
 ;
 ; KNL_64-LABEL: test26:
 ; KNL_64:       # BB#0:
-; KNL_64:         movb $3, %al
+; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -1213,10 +1369,12 @@ define <2 x i64> @test26(i64* %base, <2
 ;
 ; KNL_32-LABEL: test26:
 ; KNL_32:       # BB#0:
-; KNL_32:         movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; KNL_32-NEXT:    vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
-; KNL_32-NEXT:    vpsllvq .LCPI25_1, %zmm2, %zmm2
+; KNL_32-NEXT:    vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
 ; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -1252,7 +1410,7 @@ define <2 x float> @test27(float* %base,
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    # kill
+; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test27:
@@ -1263,7 +1421,7 @@ define <2 x float> @test27(float* %base,
 ; KNL_32-NEXT:    movb $3, %cl
 ; KNL_32-NEXT:    kmovw %ecx, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    # kill
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test27:
@@ -1273,6 +1431,15 @@ define <2 x float> @test27(float* %base,
 ; SKX-NEXT:    kmovb %eax, %k1
 ; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test27:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    movb $3, %cl
+; SKX_32-NEXT:    kmovb %ecx, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
+; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
   %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
@@ -1285,7 +1452,8 @@ define void @test28(<2 x i32>%a1, <2 x i
 ;
 ; KNL_64-LABEL: test28:
 ; KNL_64:       # BB#0:
-; KNL_64:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -1293,17 +1461,19 @@ define void @test28(<2 x i32>%a1, <2 x i
 ;
 ; KNL_32-LABEL: test28:
 ; KNL_32:       # BB#0:
-; KNL_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; KNL_32-NEXT:    vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
-; KNL_32-NEXT:    vpsllvq .LCPI27_1, %zmm2, %zmm2
+; KNL_32-NEXT:    vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
+; KNL_32-NEXT:    vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test28:
 ; SKX:       # BB#0:
-; SKX:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX-NEXT:    movb $3, %al
 ; SKX-NEXT:    kmovb %eax, %k1
 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1311,7 +1481,8 @@ define void @test28(<2 x i32>%a1, <2 x i
 ;
 ; SKX_32-LABEL: test28:
 ; SKX_32:       # BB#0:
-; SKX_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    movb $3, %al
 ; SKX_32-NEXT:    kmovb %eax, %k1
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1353,6 +1524,15 @@ define <16 x float> @test29(float* %base
 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test29:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    movw $44, %cx
+; SKX_32-NEXT:    kmovw %ecx, %k1
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
+; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
@@ -1370,9 +1550,12 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; KNL_64-LABEL: test30:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    andl $1, %edx
+; KNL_64-NEXT:    kmovw %edx, %k1
 ; KNL_64-NEXT:    andl $1, %esi
+; KNL_64-NEXT:    kmovw %esi, %k2
 ; KNL_64-NEXT:    movl %edi, %eax
 ; KNL_64-NEXT:    andl $1, %eax
+; KNL_64-NEXT:    kmovw %eax, %k0
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
 ; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
@@ -1380,76 +1563,81 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; KNL_64-NEXT:    testb $1, %dil
 ; KNL_64-NEXT:    je .LBB29_2
 ; KNL_64-NEXT:  # BB#1: # %cond.load
-; KNL_64-NEXT:    vmovq %xmm1, %rcx
+; KNL_64-NEXT:    vmovq %xmm1, %rax
 ; KNL_64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; KNL_64-NEXT:  .LBB29_2: # %else
-; KNL_64-NEXT:    testb %sil, %sil
+; KNL_64-NEXT:    kmovw %k2, %eax
+; KNL_64-NEXT:    movl %eax, %ecx
+; KNL_64-NEXT:    andl $1, %ecx
+; KNL_64-NEXT:    testb %cl, %cl
 ; KNL_64-NEXT:    je .LBB29_4
 ; KNL_64-NEXT:  # BB#3: # %cond.load1
 ; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
 ; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
 ; KNL_64-NEXT:  .LBB29_4: # %else2
+; KNL_64-NEXT:    kmovw %k1, %ecx
+; KNL_64-NEXT:    movl %ecx, %edx
+; KNL_64-NEXT:    andl $1, %edx
 ; KNL_64-NEXT:    testb %dl, %dl
 ; KNL_64-NEXT:    je .LBB29_6
 ; KNL_64-NEXT:  # BB#5: # %cond.load4
 ; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL_64-NEXT:    vmovq %xmm1, %rcx
-; KNL_64-NEXT:    vpinsrd $2, (%rcx), %xmm0, %xmm0
+; KNL_64-NEXT:    vmovq %xmm1, %rdx
+; KNL_64-NEXT:    vpinsrd $2, (%rdx), %xmm0, %xmm0
 ; KNL_64-NEXT:  .LBB29_6: # %else5
-; KNL_64-NEXT:    vmovd %eax, %xmm1
-; KNL_64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
-; KNL_64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; KNL_64-NEXT:    kmovw %k0, %edx
+; KNL_64-NEXT:    vmovd %edx, %xmm1
+; KNL_64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_64-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test30:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    pushl %ebx
-; KNL_32-NEXT:  .Ltmp0:
-; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:    pushl %esi
-; KNL_32-NEXT:  .Ltmp1:
-; KNL_32-NEXT:    .cfi_def_cfa_offset 12
-; KNL_32-NEXT:  .Ltmp2:
-; KNL_32-NEXT:    .cfi_offset %esi, -12
-; KNL_32-NEXT:  .Ltmp3:
-; KNL_32-NEXT:    .cfi_offset %ebx, -8
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    andl $1, %eax
-; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    andl $1, %eax
+; KNL_32-NEXT:    kmovw %eax, %k2
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    movl %eax, %ecx
 ; KNL_32-NEXT:    andl $1, %ecx
-; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; KNL_32-NEXT:    movl %ebx, %edx
-; KNL_32-NEXT:    andl $1, %edx
+; KNL_32-NEXT:    kmovw %ecx, %k0
 ; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; KNL_32-NEXT:    # implicit-def: %XMM0
-; KNL_32-NEXT:    testb $1, %bl
+; KNL_32-NEXT:    testb $1, %al
 ; KNL_32-NEXT:    je .LBB29_2
 ; KNL_32-NEXT:  # BB#1: # %cond.load
-; KNL_32-NEXT:    vmovd %xmm1, %esi
+; KNL_32-NEXT:    vmovd %xmm1, %eax
 ; KNL_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; KNL_32-NEXT:  .LBB29_2: # %else
+; KNL_32-NEXT:    kmovw %k2, %eax
+; KNL_32-NEXT:    movl %eax, %ecx
+; KNL_32-NEXT:    andl $1, %ecx
 ; KNL_32-NEXT:    testb %cl, %cl
 ; KNL_32-NEXT:    je .LBB29_4
 ; KNL_32-NEXT:  # BB#3: # %cond.load1
-; KNL_32-NEXT:    vpextrd $1, %xmm1, %esi
-; KNL_32-NEXT:    vpinsrd $1, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB29_4: # %else2
-; KNL_32-NEXT:    testb %al, %al
+; KNL_32-NEXT:    kmovw %k1, %ecx
+; KNL_32-NEXT:    movl %ecx, %edx
+; KNL_32-NEXT:    andl $1, %edx
+; KNL_32-NEXT:    testb %dl, %dl
 ; KNL_32-NEXT:    je .LBB29_6
 ; KNL_32-NEXT:  # BB#5: # %cond.load4
-; KNL_32-NEXT:    vpextrd $2, %xmm1, %esi
-; KNL_32-NEXT:    vpinsrd $2, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT:    vpextrd $2, %xmm1, %edx
+; KNL_32-NEXT:    vpinsrd $2, (%edx), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB29_6: # %else5
+; KNL_32-NEXT:    kmovw %k0, %edx
 ; KNL_32-NEXT:    vmovd %edx, %xmm1
-; KNL_32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; KNL_32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; KNL_32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
-; KNL_32-NEXT:    popl %esi
-; KNL_32-NEXT:    popl %ebx
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test30:
@@ -1460,35 +1648,38 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
-; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; SKX-NEXT:    kmovw %k0, %eax
-; SKX-NEXT:    # implicit-def: %XMM0
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    # implicit-def: %XMM1
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    je .LBB29_2
 ; SKX-NEXT:  # BB#1: # %cond.load
-; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SKX-NEXT:  .LBB29_2: # %else
 ; SKX-NEXT:    kshiftlw $14, %k1, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    je .LBB29_4
 ; SKX-NEXT:  # BB#3: # %cond.load1
-; SKX-NEXT:    vpextrq $1, %xmm1, %rax
-; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT:    vpextrq $1, %xmm0, %rax
+; SKX-NEXT:    vpinsrd $1, (%rax), %xmm1, %xmm1
 ; SKX-NEXT:  .LBB29_4: # %else2
 ; SKX-NEXT:    kshiftlw $13, %k1, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    je .LBB29_6
 ; SKX-NEXT:  # BB#5: # %cond.load4
-; SKX-NEXT:    vextracti64x2 $1, %ymm1, %xmm1
-; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT:    vextracti64x2 $1, %ymm0, %xmm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    vpinsrd $2, (%rax), %xmm1, %xmm1
 ; SKX-NEXT:  .LBB29_6: # %else5
-; SKX-NEXT:    vpblendmd %xmm0, %xmm3, %xmm0 {%k1}
+; SKX-NEXT:    vpblendmd %xmm1, %xmm3, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test30:
@@ -1501,35 +1692,38 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; SKX_32-NEXT:    kshiftlw $15, %k1, %k0
 ; SKX_32-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
-; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; SKX_32-NEXT:    kmovw %k0, %eax
-; SKX_32-NEXT:    # implicit-def: %XMM0
+; SKX_32-NEXT:    andl $1, %eax
+; SKX_32-NEXT:    # implicit-def: %XMM1
 ; SKX_32-NEXT:    testb %al, %al
 ; SKX_32-NEXT:    je .LBB29_2
 ; SKX_32-NEXT:  # BB#1: # %cond.load
-; SKX_32-NEXT:    vmovd %xmm1, %eax
-; SKX_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX_32-NEXT:    vmovd %xmm0, %eax
+; SKX_32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SKX_32-NEXT:  .LBB29_2: # %else
 ; SKX_32-NEXT:    kshiftlw $14, %k1, %k0
 ; SKX_32-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX_32-NEXT:    kmovw %k0, %eax
+; SKX_32-NEXT:    andl $1, %eax
 ; SKX_32-NEXT:    testb %al, %al
 ; SKX_32-NEXT:    je .LBB29_4
 ; SKX_32-NEXT:  # BB#3: # %cond.load1
-; SKX_32-NEXT:    vpextrd $1, %xmm1, %eax
-; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
+; SKX_32-NEXT:    vpextrd $1, %xmm0, %eax
+; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm1, %xmm1
 ; SKX_32-NEXT:  .LBB29_4: # %else2
 ; SKX_32-NEXT:    vmovdqa32 {{[0-9]+}}(%esp), %xmm2
 ; SKX_32-NEXT:    kshiftlw $13, %k1, %k0
 ; SKX_32-NEXT:    kshiftrw $15, %k0, %k0
 ; SKX_32-NEXT:    kmovw %k0, %eax
+; SKX_32-NEXT:    andl $1, %eax
 ; SKX_32-NEXT:    testb %al, %al
 ; SKX_32-NEXT:    je .LBB29_6
 ; SKX_32-NEXT:  # BB#5: # %cond.load4
-; SKX_32-NEXT:    vpextrd $2, %xmm1, %eax
-; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
+; SKX_32-NEXT:    vpextrd $2, %xmm0, %eax
+; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm1, %xmm1
 ; SKX_32-NEXT:  .LBB29_6: # %else5
-; SKX_32-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
+; SKX_32-NEXT:    vpblendmd %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX_32-NEXT:    addl $12, %esp
 ; SKX_32-NEXT:    retl
 
@@ -1646,12 +1840,12 @@ define <16 x i64> @test_gather_16i64(<16
 ; KNL_32-LABEL: test_gather_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp4:
+; KNL_32-NEXT:  .Ltmp0:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp5:
+; KNL_32-NEXT:  .Ltmp1:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp6:
+; KNL_32-NEXT:  .Ltmp2:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1769,12 +1963,12 @@ define <16 x double> @test_gather_16f64(
 ; KNL_32-LABEL: test_gather_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp7:
+; KNL_32-NEXT:  .Ltmp3:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp8:
+; KNL_32-NEXT:  .Ltmp4:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp9:
+; KNL_32-NEXT:  .Ltmp5:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1886,12 +2080,12 @@ define void @test_scatter_16i64(<16 x i6
 ; KNL_32-LABEL: test_scatter_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp10:
+; KNL_32-NEXT:  .Ltmp6:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp11:
+; KNL_32-NEXT:  .Ltmp7:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp12:
+; KNL_32-NEXT:  .Ltmp8:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -2000,12 +2194,12 @@ define void @test_scatter_16f64(<16 x do
 ; KNL_32-LABEL: test_scatter_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp13:
+; KNL_32-NEXT:  .Ltmp9:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp14:
+; KNL_32-NEXT:  .Ltmp10:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp15:
+; KNL_32-NEXT:  .Ltmp11:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp

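In the masked_memop hunks below, AVX512F scalarizes the v16i8/v64i8 masked
loads: each lane's mask bit is isolated with a kshiftlw/kshiftrw pair, moved
to a GPR, and, after this revert, explicitly masked with andl $1 before testb
decides whether the cond.load block runs. A standalone C++ sketch of that
bit-extraction sequence (illustrative only, not the generated code itself):

#include <cstdint>
#include <cstdio>

// Isolate bit `lane` of a 16-bit mask the way the generated code does: shift
// the bit up to position 15 (kshiftlw), back down (kshiftrw), then mask to
// bit 0 (the re-added "andl $1") before the test.
static unsigned lane_bit(uint16_t k, unsigned lane) {
  uint16_t t = static_cast<uint16_t>(k << (15 - lane)); // kshiftlw
  t = static_cast<uint16_t>(t >> 15);                   // kshiftrw
  return t & 1u;                                        // andl $1
}

int main() {
  uint16_t mask = 0x002D; // lanes 0, 2, 3, 5 set
  for (unsigned lane = 0; lane < 16; ++lane)
    if (lane_bit(mask, lane))
      std::printf("lane %u: cond.load\n", lane); // the conditional element load
  return 0;
}
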
Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Thu Aug 25 16:55:41 2016
@@ -2346,6 +2346,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    ## implicit-def: %XMM0
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_2
@@ -2356,6 +2357,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.load1
@@ -2364,6 +2366,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.load4
@@ -2372,6 +2375,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.load7
@@ -2380,6 +2384,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.load10
@@ -2388,6 +2393,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.load13
@@ -2396,6 +2402,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.load16
@@ -2404,6 +2411,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.load19
@@ -2412,6 +2420,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $7, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.load22
@@ -2420,6 +2429,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $6, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.load25
@@ -2428,6 +2438,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $5, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.load28
@@ -2436,6 +2447,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $4, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.load31
@@ -2444,6 +2456,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $3, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.load34
@@ -2452,6 +2465,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $2, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.load37
@@ -2460,6 +2474,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:    kshiftlw $1, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.load40
@@ -2467,6 +2482,7 @@ define <16 x i8> @test_mask_load_16xi8(<
 ; AVX512F-NEXT:  LBB50_30: ## %else41
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB50_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.load43
@@ -4613,6 +4629,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_2
 ; AVX512F-NEXT:  ## BB#1: ## %cond.load
@@ -4623,6 +4640,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.load1
@@ -4633,6 +4651,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.load4
@@ -4643,6 +4662,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.load7
@@ -4653,6 +4673,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.load10
@@ -4663,6 +4684,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.load13
@@ -4673,6 +4695,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.load16
@@ -4683,6 +4706,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.load19
@@ -4693,6 +4717,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.load22
@@ -4703,6 +4728,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, (%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.load25
@@ -4713,6 +4739,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.load28
@@ -4723,6 +4750,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.load31
@@ -4733,6 +4761,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.load34
@@ -4744,6 +4773,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.load37
@@ -4755,6 +4785,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.load40
@@ -4765,6 +4796,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.load43
@@ -4775,6 +4807,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_34
 ; AVX512F-NEXT:  ## BB#33: ## %cond.load46
@@ -4786,6 +4819,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_36
 ; AVX512F-NEXT:  ## BB#35: ## %cond.load49
@@ -4797,6 +4831,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_38
 ; AVX512F-NEXT:  ## BB#37: ## %cond.load52
@@ -4808,6 +4843,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_40
 ; AVX512F-NEXT:  ## BB#39: ## %cond.load55
@@ -4819,6 +4855,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_42
 ; AVX512F-NEXT:  ## BB#41: ## %cond.load58
@@ -4830,6 +4867,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_44
 ; AVX512F-NEXT:  ## BB#43: ## %cond.load61
@@ -4841,6 +4879,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_46
 ; AVX512F-NEXT:  ## BB#45: ## %cond.load64
@@ -4852,6 +4891,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_48
 ; AVX512F-NEXT:  ## BB#47: ## %cond.load67
@@ -4863,6 +4903,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_50
 ; AVX512F-NEXT:  ## BB#49: ## %cond.load70
@@ -4874,6 +4915,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_52
 ; AVX512F-NEXT:  ## BB#51: ## %cond.load73
@@ -4885,6 +4927,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_54
 ; AVX512F-NEXT:  ## BB#53: ## %cond.load76
@@ -4896,6 +4939,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_56
 ; AVX512F-NEXT:  ## BB#55: ## %cond.load79
@@ -4907,6 +4951,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_58
 ; AVX512F-NEXT:  ## BB#57: ## %cond.load82
@@ -4919,6 +4964,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_60
 ; AVX512F-NEXT:  ## BB#59: ## %cond.load85
@@ -4931,6 +4977,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_62
 ; AVX512F-NEXT:  ## BB#61: ## %cond.load88
@@ -4942,6 +4989,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_64
 ; AVX512F-NEXT:  ## BB#63: ## %cond.load91
@@ -4953,6 +5001,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_66
 ; AVX512F-NEXT:  ## BB#65: ## %cond.load94
@@ -4963,6 +5012,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_68
 ; AVX512F-NEXT:  ## BB#67: ## %cond.load97
@@ -4973,6 +5023,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_70
 ; AVX512F-NEXT:  ## BB#69: ## %cond.load100
@@ -4983,6 +5034,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_72
 ; AVX512F-NEXT:  ## BB#71: ## %cond.load103
@@ -4993,6 +5045,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_74
 ; AVX512F-NEXT:  ## BB#73: ## %cond.load106
@@ -5003,6 +5056,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_76
 ; AVX512F-NEXT:  ## BB#75: ## %cond.load109
@@ -5013,6 +5067,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_78
 ; AVX512F-NEXT:  ## BB#77: ## %cond.load112
@@ -5023,6 +5078,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_80
 ; AVX512F-NEXT:  ## BB#79: ## %cond.load115
@@ -5033,6 +5089,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_82
 ; AVX512F-NEXT:  ## BB#81: ## %cond.load118
@@ -5043,6 +5100,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_84
 ; AVX512F-NEXT:  ## BB#83: ## %cond.load121
@@ -5053,6 +5111,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_86
 ; AVX512F-NEXT:  ## BB#85: ## %cond.load124
@@ -5063,6 +5122,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_88
 ; AVX512F-NEXT:  ## BB#87: ## %cond.load127
@@ -5073,6 +5133,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_90
 ; AVX512F-NEXT:  ## BB#89: ## %cond.load130
@@ -5084,6 +5145,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_92
 ; AVX512F-NEXT:  ## BB#91: ## %cond.load133
@@ -5095,26 +5157,29 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_94
 ; AVX512F-NEXT:  ## BB#93: ## %cond.load136
 ; AVX512F-NEXT:    vpinsrb $14, 46(%rdi), %xmm1, %xmm3
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:  LBB52_94: ## %else137
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k5
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_96
 ; AVX512F-NEXT:  ## BB#95: ## %cond.load139
 ; AVX512F-NEXT:    vpinsrb $15, 47(%rdi), %xmm1, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:  LBB52_96: ## %else140
-; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $15, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_98
 ; AVX512F-NEXT:  ## BB#97: ## %cond.load142
@@ -5122,10 +5187,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $0, 48(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_98: ## %else143
-; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_100
 ; AVX512F-NEXT:  ## BB#99: ## %cond.load145
@@ -5133,10 +5199,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $1, 49(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_100: ## %else146
-; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $13, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_102
 ; AVX512F-NEXT:  ## BB#101: ## %cond.load148
@@ -5144,10 +5211,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $2, 50(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_102: ## %else149
-; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_104
 ; AVX512F-NEXT:  ## BB#103: ## %cond.load151
@@ -5155,10 +5223,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $3, 51(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_104: ## %else152
-; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $11, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_106
 ; AVX512F-NEXT:  ## BB#105: ## %cond.load154
@@ -5166,10 +5235,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $4, 52(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_106: ## %else155
-; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $10, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_108
 ; AVX512F-NEXT:  ## BB#107: ## %cond.load157
@@ -5177,10 +5247,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $5, 53(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_108: ## %else158
-; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $9, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_110
 ; AVX512F-NEXT:  ## BB#109: ## %cond.load160
@@ -5188,10 +5259,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $6, 54(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_110: ## %else161
-; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_112
 ; AVX512F-NEXT:  ## BB#111: ## %cond.load163
@@ -5199,10 +5271,11 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $7, 55(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_112: ## %else164
-; AVX512F-NEXT:    kshiftlw $7, %k1, %k0
+; AVX512F-NEXT:    kshiftlw $7, %k5, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_114
 ; AVX512F-NEXT:  ## BB#113: ## %cond.load166
@@ -5210,9 +5283,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $8, 56(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_114: ## %else167
-; AVX512F-NEXT:    kshiftlw $6, %k1, %k2
-; AVX512F-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512F-NEXT:    kshiftlw $6, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k2
 ; AVX512F-NEXT:    kmovw %k2, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_116
 ; AVX512F-NEXT:  ## BB#115: ## %cond.load169
@@ -5220,9 +5294,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $9, 57(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_116: ## %else170
-; AVX512F-NEXT:    kshiftlw $5, %k1, %k3
-; AVX512F-NEXT:    kshiftrw $15, %k3, %k3
+; AVX512F-NEXT:    kshiftlw $5, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k3
 ; AVX512F-NEXT:    kmovw %k3, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_118
 ; AVX512F-NEXT:  ## BB#117: ## %cond.load172
@@ -5230,9 +5305,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $10, 58(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_118: ## %else173
-; AVX512F-NEXT:    kshiftlw $4, %k1, %k4
-; AVX512F-NEXT:    kshiftrw $15, %k4, %k4
+; AVX512F-NEXT:    kshiftlw $4, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k4
 ; AVX512F-NEXT:    kmovw %k4, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_120
 ; AVX512F-NEXT:  ## BB#119: ## %cond.load175
@@ -5240,9 +5316,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $11, 59(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_120: ## %else176
-; AVX512F-NEXT:    kshiftlw $3, %k1, %k5
-; AVX512F-NEXT:    kshiftrw $15, %k5, %k5
-; AVX512F-NEXT:    kmovw %k5, %eax
+; AVX512F-NEXT:    kshiftlw $3, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k6
+; AVX512F-NEXT:    kmovw %k6, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_122
 ; AVX512F-NEXT:  ## BB#121: ## %cond.load178
@@ -5250,9 +5327,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $12, 60(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_122: ## %else179
-; AVX512F-NEXT:    kshiftlw $2, %k1, %k6
-; AVX512F-NEXT:    kshiftrw $15, %k6, %k6
-; AVX512F-NEXT:    kmovw %k6, %eax
+; AVX512F-NEXT:    kshiftlw $2, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k7
+; AVX512F-NEXT:    kmovw %k7, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_124
 ; AVX512F-NEXT:  ## BB#123: ## %cond.load181
@@ -5260,9 +5338,10 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $13, 61(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_124: ## %else182
-; AVX512F-NEXT:    kshiftlw $1, %k1, %k7
-; AVX512F-NEXT:    kshiftrw $15, %k7, %k7
-; AVX512F-NEXT:    kmovw %k7, %eax
+; AVX512F-NEXT:    kshiftlw $1, %k5, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_126
 ; AVX512F-NEXT:  ## BB#125: ## %cond.load184
@@ -5270,8 +5349,9 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $14, 62(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_126: ## %else185
-; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    kshiftrw $15, %k5, %k5
+; AVX512F-NEXT:    kmovw %k5, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB52_128
 ; AVX512F-NEXT:  ## BB#127: ## %cond.load187
@@ -5279,137 +5359,137 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $15, 63(%rdi), %xmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:  LBB52_128: ## %else188
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw (%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw (%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, (%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; AVX512F-NEXT:    kmovw %k1, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; AVX512F-NEXT:    kmovw %k2, %eax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
 ; AVX512F-NEXT:    kmovw %k3, %r12d
 ; AVX512F-NEXT:    kmovw %k4, %r15d
-; AVX512F-NEXT:    kmovw %k5, %r14d
-; AVX512F-NEXT:    kmovw %k6, %ebx
-; AVX512F-NEXT:    kmovw %k7, %r11d
-; AVX512F-NEXT:    kmovw %k1, %r10d
+; AVX512F-NEXT:    kmovw %k6, %r14d
+; AVX512F-NEXT:    kmovw %k7, %ebx
+; AVX512F-NEXT:    kmovw %k0, %r11d
+; AVX512F-NEXT:    kmovw %k5, %r10d
 ; AVX512F-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
@@ -5482,7 +5562,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    kmovw %k0, %r15d
 ; AVX512F-NEXT:    vpinsrb $12, %r14d, %xmm6, %xmm6
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; AVX512F-NEXT:    kmovw %k0, %r14d
+; AVX512F-NEXT:    kmovw %k0, %ebp
 ; AVX512F-NEXT:    vpinsrb $13, %ebx, %xmm6, %xmm6
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
 ; AVX512F-NEXT:    kmovw %k0, %ebx
@@ -5510,7 +5590,7 @@ define <64 x i8> @test_mask_load_64xi8(<
 ; AVX512F-NEXT:    vpinsrb $8, %r13d, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpinsrb $9, %r12d, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpinsrb $10, %r15d, %xmm2, %xmm2
-; AVX512F-NEXT:    vpinsrb $11, %r14d, %xmm2, %xmm2
+; AVX512F-NEXT:    vpinsrb $11, %ebp, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpinsrb $12, %ebx, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpinsrb $13, %r11d, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpinsrb $14, %r10d, %xmm2, %xmm2
@@ -5609,6 +5689,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    ## implicit-def: %XMM0
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_2
@@ -5619,6 +5700,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.load1
@@ -5627,6 +5709,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.load4
@@ -5635,6 +5718,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.load7
@@ -5643,6 +5727,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.load10
@@ -5651,6 +5736,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.load13
@@ -5659,6 +5745,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.load16
@@ -5667,6 +5754,7 @@ define <8 x i16> @test_mask_load_8xi16(<
 ; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB53_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.load19
@@ -5963,6 +6051,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    ## implicit-def: %YMM0
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_2
@@ -5973,6 +6062,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.load1
@@ -5982,6 +6072,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.load4
@@ -5991,6 +6082,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.load7
@@ -6000,6 +6092,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.load10
@@ -6009,6 +6102,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.load13
@@ -6018,6 +6112,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.load16
@@ -6027,6 +6122,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.load19
@@ -6036,6 +6132,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $7, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.load22
@@ -6046,6 +6143,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $6, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.load25
@@ -6056,6 +6154,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $5, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.load28
@@ -6066,6 +6165,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $4, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.load31
@@ -6076,6 +6176,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $3, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.load34
@@ -6086,6 +6187,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $2, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.load37
@@ -6096,6 +6198,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:    kshiftlw $1, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.load40
@@ -6105,6 +6208,7 @@ define <16 x i16> @test_mask_load_16xi16
 ; AVX512F-NEXT:  LBB54_30: ## %else41
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB54_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.load43
@@ -7022,6 +7126,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_2
 ; AVX512F-NEXT:  ## BB#1: ## %cond.store
@@ -7030,6 +7135,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.store1
@@ -7038,6 +7144,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $13, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.store3
@@ -7046,6 +7153,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.store5
@@ -7054,6 +7162,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $11, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.store7
@@ -7062,6 +7171,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $10, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.store9
@@ -7070,6 +7180,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $9, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.store11
@@ -7078,6 +7189,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.store13
@@ -7086,6 +7198,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $7, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.store15
@@ -7094,6 +7207,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $6, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.store17
@@ -7102,6 +7216,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $5, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.store19
@@ -7110,6 +7225,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $4, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.store21
@@ -7118,6 +7234,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $3, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.store23
@@ -7126,6 +7243,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $2, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.store25
@@ -7134,6 +7252,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:    kshiftlw $1, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.store27
@@ -7141,6 +7260,7 @@ define void @test_mask_store_16xi8(<16 x
 ; AVX512F-NEXT:  LBB56_30: ## %else28
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB56_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.store29
@@ -8653,6 +8773,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_2
 ; AVX512F-NEXT:  ## BB#1: ## %cond.store
@@ -8661,6 +8782,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.store1
@@ -8669,6 +8791,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $13, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.store3
@@ -8677,6 +8800,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.store5
@@ -8685,6 +8809,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $11, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.store7
@@ -8693,6 +8818,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $10, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.store9
@@ -8701,6 +8827,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $9, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.store11
@@ -8709,6 +8836,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.store13
@@ -8717,6 +8845,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $7, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.store15
@@ -8725,6 +8854,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $6, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.store17
@@ -8733,6 +8863,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $5, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.store19
@@ -8741,6 +8872,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $4, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.store21
@@ -8749,6 +8881,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $3, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.store23
@@ -8758,6 +8891,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $2, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.store25
@@ -8767,6 +8901,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $1, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.store27
@@ -8775,6 +8910,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.store29
@@ -8783,6 +8919,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_34
 ; AVX512F-NEXT:  ## BB#33: ## %cond.store31
@@ -8792,6 +8929,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_36
 ; AVX512F-NEXT:  ## BB#35: ## %cond.store33
@@ -8801,6 +8939,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_38
 ; AVX512F-NEXT:  ## BB#37: ## %cond.store35
@@ -8810,6 +8949,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_40
 ; AVX512F-NEXT:  ## BB#39: ## %cond.store37
@@ -8819,6 +8959,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_42
 ; AVX512F-NEXT:  ## BB#41: ## %cond.store39
@@ -8828,6 +8969,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_44
 ; AVX512F-NEXT:  ## BB#43: ## %cond.store41
@@ -8837,6 +8979,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_46
 ; AVX512F-NEXT:  ## BB#45: ## %cond.store43
@@ -8846,6 +8989,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_48
 ; AVX512F-NEXT:  ## BB#47: ## %cond.store45
@@ -8855,6 +8999,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $7, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_50
 ; AVX512F-NEXT:  ## BB#49: ## %cond.store47
@@ -8864,6 +9009,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $6, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_52
 ; AVX512F-NEXT:  ## BB#51: ## %cond.store49
@@ -8873,6 +9019,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $5, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_54
 ; AVX512F-NEXT:  ## BB#53: ## %cond.store51
@@ -8882,6 +9029,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $4, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_56
 ; AVX512F-NEXT:  ## BB#55: ## %cond.store53
@@ -8891,6 +9039,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $3, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_58
 ; AVX512F-NEXT:  ## BB#57: ## %cond.store55
@@ -8901,6 +9050,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $2, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_60
 ; AVX512F-NEXT:  ## BB#59: ## %cond.store57
@@ -8911,6 +9061,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $1, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_62
 ; AVX512F-NEXT:  ## BB#61: ## %cond.store59
@@ -8920,6 +9071,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_64
 ; AVX512F-NEXT:  ## BB#63: ## %cond.store61
@@ -8929,6 +9081,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_66
 ; AVX512F-NEXT:  ## BB#65: ## %cond.store63
@@ -8937,6 +9090,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_68
 ; AVX512F-NEXT:  ## BB#67: ## %cond.store65
@@ -8945,6 +9099,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $13, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_70
 ; AVX512F-NEXT:  ## BB#69: ## %cond.store67
@@ -8953,6 +9108,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_72
 ; AVX512F-NEXT:  ## BB#71: ## %cond.store69
@@ -8961,6 +9117,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $11, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_74
 ; AVX512F-NEXT:  ## BB#73: ## %cond.store71
@@ -8969,6 +9126,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $10, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_76
 ; AVX512F-NEXT:  ## BB#75: ## %cond.store73
@@ -8977,6 +9135,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $9, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_78
 ; AVX512F-NEXT:  ## BB#77: ## %cond.store75
@@ -8985,6 +9144,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_80
 ; AVX512F-NEXT:  ## BB#79: ## %cond.store77
@@ -8993,6 +9153,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $7, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_82
 ; AVX512F-NEXT:  ## BB#81: ## %cond.store79
@@ -9001,6 +9162,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $6, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_84
 ; AVX512F-NEXT:  ## BB#83: ## %cond.store81
@@ -9009,6 +9171,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $5, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_86
 ; AVX512F-NEXT:  ## BB#85: ## %cond.store83
@@ -9017,6 +9180,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $4, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_88
 ; AVX512F-NEXT:  ## BB#87: ## %cond.store85
@@ -9025,6 +9189,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $3, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_90
 ; AVX512F-NEXT:  ## BB#89: ## %cond.store87
@@ -9034,6 +9199,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $2, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_92
 ; AVX512F-NEXT:  ## BB#91: ## %cond.store89
@@ -9043,6 +9209,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $1, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_94
 ; AVX512F-NEXT:  ## BB#93: ## %cond.store91
@@ -9051,6 +9218,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_96
 ; AVX512F-NEXT:  ## BB#95: ## %cond.store93
@@ -9059,6 +9227,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_98
 ; AVX512F-NEXT:  ## BB#97: ## %cond.store95
@@ -9068,6 +9237,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $14, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_100
 ; AVX512F-NEXT:  ## BB#99: ## %cond.store97
@@ -9077,6 +9247,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $13, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_102
 ; AVX512F-NEXT:  ## BB#101: ## %cond.store99
@@ -9086,6 +9257,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $12, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_104
 ; AVX512F-NEXT:  ## BB#103: ## %cond.store101
@@ -9095,6 +9267,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $11, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_106
 ; AVX512F-NEXT:  ## BB#105: ## %cond.store103
@@ -9104,6 +9277,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $10, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_108
 ; AVX512F-NEXT:  ## BB#107: ## %cond.store105
@@ -9113,6 +9287,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $9, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_110
 ; AVX512F-NEXT:  ## BB#109: ## %cond.store107
@@ -9122,6 +9297,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $8, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_112
 ; AVX512F-NEXT:  ## BB#111: ## %cond.store109
@@ -9131,6 +9307,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $7, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_114
 ; AVX512F-NEXT:  ## BB#113: ## %cond.store111
@@ -9140,6 +9317,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $6, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_116
 ; AVX512F-NEXT:  ## BB#115: ## %cond.store113
@@ -9149,6 +9327,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $5, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_118
 ; AVX512F-NEXT:  ## BB#117: ## %cond.store115
@@ -9158,6 +9337,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $4, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_120
 ; AVX512F-NEXT:  ## BB#119: ## %cond.store117
@@ -9167,6 +9347,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $3, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_122
 ; AVX512F-NEXT:  ## BB#121: ## %cond.store119
@@ -9176,6 +9357,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $2, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_124
 ; AVX512F-NEXT:  ## BB#123: ## %cond.store121
@@ -9185,6 +9367,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:    kshiftlw $1, %k1, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_126
 ; AVX512F-NEXT:  ## BB#125: ## %cond.store123
@@ -9193,6 +9376,7 @@ define void @test_mask_store_64xi8(<64 x
 ; AVX512F-NEXT:  LBB58_126: ## %else124
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB58_128
 ; AVX512F-NEXT:  ## BB#127: ## %cond.store125
@@ -9273,6 +9457,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_2
 ; AVX512F-NEXT:  ## BB#1: ## %cond.store
@@ -9281,6 +9466,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.store1
@@ -9289,6 +9475,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $13, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.store3
@@ -9297,6 +9484,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.store5
@@ -9305,6 +9493,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $11, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.store7
@@ -9313,6 +9502,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $10, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.store9
@@ -9321,6 +9511,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $9, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.store11
@@ -9329,6 +9520,7 @@ define void @test_mask_store_8xi16(<8 x
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB59_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.store13
@@ -9574,6 +9766,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_2
 ; AVX512F-NEXT:  ## BB#1: ## %cond.store
@@ -9582,6 +9775,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_4
 ; AVX512F-NEXT:  ## BB#3: ## %cond.store1
@@ -9590,6 +9784,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $13, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_6
 ; AVX512F-NEXT:  ## BB#5: ## %cond.store3
@@ -9598,6 +9793,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_8
 ; AVX512F-NEXT:  ## BB#7: ## %cond.store5
@@ -9606,6 +9802,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $11, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_10
 ; AVX512F-NEXT:  ## BB#9: ## %cond.store7
@@ -9614,6 +9811,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $10, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_12
 ; AVX512F-NEXT:  ## BB#11: ## %cond.store9
@@ -9622,6 +9820,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $9, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_14
 ; AVX512F-NEXT:  ## BB#13: ## %cond.store11
@@ -9630,6 +9829,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_16
 ; AVX512F-NEXT:  ## BB#15: ## %cond.store13
@@ -9638,6 +9838,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $7, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_18
 ; AVX512F-NEXT:  ## BB#17: ## %cond.store15
@@ -9647,6 +9848,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $6, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_20
 ; AVX512F-NEXT:  ## BB#19: ## %cond.store17
@@ -9656,6 +9858,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $5, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_22
 ; AVX512F-NEXT:  ## BB#21: ## %cond.store19
@@ -9665,6 +9868,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $4, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_24
 ; AVX512F-NEXT:  ## BB#23: ## %cond.store21
@@ -9674,6 +9878,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $3, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_26
 ; AVX512F-NEXT:  ## BB#25: ## %cond.store23
@@ -9683,6 +9888,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $2, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_28
 ; AVX512F-NEXT:  ## BB#27: ## %cond.store25
@@ -9692,6 +9898,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:    kshiftlw $1, %k0, %k1
 ; AVX512F-NEXT:    kshiftrw $15, %k1, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_30
 ; AVX512F-NEXT:  ## BB#29: ## %cond.store27
@@ -9700,6 +9907,7 @@ define void @test_mask_store_16xi16(<16
 ; AVX512F-NEXT:  LBB60_30: ## %else28
 ; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    andl $1, %eax
 ; AVX512F-NEXT:    testb %al, %al
 ; AVX512F-NEXT:    je LBB60_32
 ; AVX512F-NEXT:  ## BB#31: ## %cond.store29
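
All of the masked_memop.ll hunks above restore the same pattern: each scalarized lane test isolates one mask bit with a kshiftlw/kshiftrw pair, copies it to a GPR with kmovw, and, with this revert, explicitly re-masks it with "andl $1" before the testb instead of relying on the high bits being known zero. A minimal sketch of the kind of test these CHECK lines belong to, in the style of masked_memop.ll; the function name, operand values, and intrinsic name mangling here are illustrative, not quoted from the patch:

  ; AVX512F has no native masked store for v8i16, so the lowering
  ; scalarizes it into one conditional store per lane. For lane i the
  ; expected code is the idiom checked above:
  ;   kshiftlw $(15-i), %k0, %k1   ; move lane i up to the mask MSB
  ;   kshiftrw $15, %k1, %k1       ; ...and back down to bit 0
  ;   kmovw    %k1, %eax
  ;   andl     $1, %eax            ; re-added by this revert
  ;   testb    %al, %al
  ;   je       <else block>        ; skip the store for a zero lane
  define void @store_8xi16_sketch(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1> %mask)
    ret void
  }
  declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)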

Modified: llvm/trunk/test/CodeGen/X86/pr27591.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr27591.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr27591.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr27591.ll Thu Aug 25 16:55:41 2016
@@ -8,8 +8,9 @@ define void @test1(i32 %x) #0 {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    kmovw %ecx, %k0
+; CHECK-NEXT:    movb %al, %dil
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kmovb %k0, %eax
 ; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    movzbl %al, %edi
@@ -28,14 +29,16 @@ define void @test2(i32 %x) #0 {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    kmovw %ecx, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    movb %cl, %al
+; CHECK-NEXT:    movb %al, %dil
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    kmovw %k0, %edi
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    movb %dil, %al
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    movl $-1, %edx
-; CHECK-NEXT:    cmovnel %edx, %edi
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    cmovnel %ecx, %edi
 ; CHECK-NEXT:    callq callee2
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
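
In pr27591.ll the same re-masking now shows up for a scalar i1: the setne result is and'ed with 1 both on its way into k0 and when it is read back out. The test bodies are not part of these hunks; a plausible reduction with the shape the test2 CHECK lines imply (the callee name follows the existing callee2, everything else is assumed):

  define void @test2(i32 %x) {
  entry:
    %tobool = icmp ne i32 %x, 0      ; testl %edi, %edi / setne %al
    %conv = sext i1 %tobool to i32   ; 0 or -1, via the cmovnel of $-1
    call void @callee2(i32 %conv)
    ret void
  }
  declare void @callee2(i32)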

Modified: llvm/trunk/test/CodeGen/X86/pr28173.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr28173.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr28173.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr28173.ll Thu Aug 25 16:55:41 2016
@@ -5,13 +5,12 @@ target triple = "x86_64-unknown-linux-gn
 ; Note that the kmovs should really *not* appear in the output, this is an
 ; artifact of the current poor lowering. This is tracked by PR28175.
 
+; CHECK-LABEL: @foo64
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orq  $-2, %rax
+; CHECK: ret
 define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo64:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    # kill
-; CHECK-NEXT:    orq $-2, %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    retq
   br label %bb
 
 bb:
@@ -23,12 +22,12 @@ end:
   ret i64 %v
 }
 
+; CHECK-LABEL: @foo16
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orl $65534, %eax
+; CHECK: retq
 define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    orl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
   br label %bb
 
 bb:
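
For pr28173.ll the checks are loosened from exact CHECK-NEXT sequences to bare CHECK patterns, because the revert reintroduces the kmov round trip that the old, tighter expectations could not match; as the retained comment says, those kmovs are an artifact of poor lowering tracked by PR28175. Pieced together from the context lines in these hunks, foo64 presumably looks like this (a reconstruction, not text from the patch):

  define i64 @foo64(i1 zeroext %i, i32 %j) {
    br label %bb

  bb:
    %z = zext i1 %i to i64   ; the zeroext i1 now bounces through a k-register
    %v = or i64 %z, -2       ; matches the 'orq  $-2, %rax' check
    br label %end

  end:
    ret i64 %v
  }

foo16 presumably follows the same shape at i16, matching its 'orl $65534, %eax' check.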

Modified: llvm/trunk/test/CodeGen/X86/xaluo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xaluo.ll?rev=279782&r1=279781&r2=279782&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xaluo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xaluo.ll Thu Aug 25 16:55:41 2016
@@ -738,10 +738,10 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 ; KNL-LABEL: bug27873:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    andl $1, %esi
-; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movl $160, %ecx
 ; KNL-NEXT:    movq %rdi, %rax
 ; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    seto %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
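
The xaluo.ll update is only a scheduling change: the kmovw that saves the masked %esi now issues after the mulq rather than before it. The function under test is easy to reconstruct from its signature in the hunk header and the CHECK lines; the intrinsic-based body below is an assumption consistent with them:

  declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)

  define i1 @bug27873(i64 %c1, i1 %c2) {
    ; movl $160, %ecx / mulq %rcx computes the product; seto %al its overflow
    %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
    %ovf = extractvalue { i64, i1 } %mul, 1
    ; korw %k1, %k0, %k0 ors the overflow bit with the incoming %c2
    %res = or i1 %c2, %ovf
    ret i1 %res
  }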
