[llvm] r274615 - Reverted r274613 due to a compilation failure.

Elena Demikhovsky via llvm-commits <llvm-commits at lists.llvm.org>
Wed Jul 6 02:11:49 PDT 2016


Author: delena
Date: Wed Jul  6 04:11:49 2016
New Revision: 274615

URL: http://llvm.org/viewvc/llvm-project?rev=274615&view=rev
Log:
Reverted r274613 due to a compilation failure.
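
For context: r274613 taught the X86 backend to tag i1 SETCC results with an
ISD::AssertZext annotation instead of re-masking them with an explicit AND,
and this commit backs that change out wholesale. As a reading aid, here is
the reverted lowering pattern in one place, reconstructed from the '-' lines
of the X86ISelLowering.cpp hunks below (a sketch, not new code):

    // Reverted pattern: build the flag result as an i8 SETCC, assert that
    // only the low bit can be set, then truncate to i1 with no extra AND.
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
    if (VT == MVT::i1) {
      SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
                          DAG.getValueType(MVT::i1));
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
    }
    return SetCC;

With the annotation gone, the truncate can no longer be folded for free,
which is why the test updates below regain explicit "andl $1" instructions.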

Modified:
    llvm/trunk/lib/Target/X86/X86FastISel.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512-ext.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/pr27591.ll
    llvm/trunk/test/CodeGen/X86/pr28173.ll
    llvm/trunk/test/CodeGen/X86/xaluo.ll

Modified: llvm/trunk/lib/Target/X86/X86FastISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FastISel.cpp?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FastISel.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FastISel.cpp Wed Jul  6 04:11:49 2016
@@ -1404,9 +1404,6 @@ bool X86FastISel::X86SelectCmp(const Ins
   if (!isTypeLegal(I->getOperand(0)->getType(), VT))
     return false;
 
-  if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512())
-    return false;
-
   // Try to optimize or fold the cmp.
   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
   unsigned ResultReg = 0;
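
The guard deleted above was added by r274613 so that FastISel would punt
i1-typed compares to SelectionDAG, where AVX-512 mask registers are handled;
with the guard gone, FastISel selects them itself again. The removed check,
copied from the '-' lines above with comments added:

    // r274613: under AVX-512 an i1 value lives in a k-register, so bail
    // out and let SelectionDAG lower the compare. This revert drops the
    // bail-out.
    if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512())
      return false;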

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jul  6 04:11:49 2016
@@ -15551,11 +15551,8 @@ SDValue X86TargetLowering::LowerSETCC(SD
       isNullConstant(Op1) &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
-      if (VT == MVT::i1) {
-        NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
-                               DAG.getValueType(MVT::i1));
+      if (VT == MVT::i1)
         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
-      }
       return NewSetCC;
     }
   }
@@ -15577,11 +15574,8 @@ SDValue X86TargetLowering::LowerSETCC(SD
       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                   DAG.getConstant(CCode, dl, MVT::i8),
                                   Op0.getOperand(1));
-      if (VT == MVT::i1) {
-        SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
-                            DAG.getValueType(MVT::i1));
+      if (VT == MVT::i1)
         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
-      }
       return SetCC;
     }
   }
@@ -15605,11 +15599,8 @@ SDValue X86TargetLowering::LowerSETCC(SD
   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
-  if (VT == MVT::i1) {
-    SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
-                        DAG.getValueType(MVT::i1));
+  if (VT == MVT::i1)
     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
-  }
   return SetCC;
 }
 
@@ -15628,11 +15619,8 @@ SDValue X86TargetLowering::LowerSETCCE(S
   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
   SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                               DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
-  if (Op.getSimpleValueType() == MVT::i1) {
-    SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
-                        DAG.getValueType(MVT::i1));
-    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-  }
+  if (Op.getSimpleValueType() == MVT::i1)
+      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   return SetCC;
 }
 
@@ -15662,23 +15650,14 @@ static bool isX86LogicalCmp(SDValue Op)
   return false;
 }
 
-/// Returns the "condition" node, that may be wrapped with "truncate".
-/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
-static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
-    return V;
+    return false;
 
   SDValue VOp0 = V.getOperand(0);
-  if (VOp0.getOpcode() == ISD::AssertZext &&
-      V.getValueSizeInBits() ==
-      cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
-    return VOp0.getOperand(0);
-
   unsigned InBits = VOp0.getValueSizeInBits();
   unsigned Bits = V.getValueSizeInBits();
-  if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
-    return V.getOperand(0);
-  return V;
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
 }
 
 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -15901,7 +15880,8 @@ SDValue X86TargetLowering::LowerSELECT(S
 
   if (addTest) {
     // Look past the truncate if the high bits are known zero.
-    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+      Cond = Cond.getOperand(0);
 
     // We know the result of AND is compared against zero. Try to match
     // it to BT.
@@ -16739,7 +16719,8 @@ SDValue X86TargetLowering::LowerBRCOND(S
 
   if (addTest) {
     // Look pass the truncate if the high bits are known zero.
-    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
 
     // We know the result of AND is compared against zero. Try to match
     // it to BT.
@@ -17999,7 +17980,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
-    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
@@ -20513,15 +20494,10 @@ static SDValue LowerXALUO(SDValue Op, Se
     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
 
     SDValue SetCC =
-      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+      DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
                   SDValue(Sum.getNode(), 2));
 
-    if (N->getValueType(1) == MVT::i1) {
-      SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
-                          DAG.getValueType(MVT::i1));
-      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-    }
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   }
   }
@@ -20531,15 +20507,10 @@ static SDValue LowerXALUO(SDValue Op, Se
   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
 
   SDValue SetCC =
-    DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
                 DAG.getConstant(Cond, DL, MVT::i32),
                 SDValue(Sum.getNode(), 1));
-  
-  if (N->getValueType(1) == MVT::i1) {
-    SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
-                        DAG.getValueType(MVT::i1));
-    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
-  }
+
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }
 
@@ -26899,7 +26870,6 @@ static SDValue checkBoolTestSetCCCombine
   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
          SetCC.getOpcode() == ISD::TRUNCATE ||
-         SetCC.getOpcode() == ISD::AssertZext ||
          SetCC.getOpcode() == ISD::AND) {
     if (SetCC.getOpcode() == ISD::AND) {
       int OpIdx = -1;
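
Taken together, these hunks swap the AssertZext-aware helper
getCondAfterTruncWithZeroHighBitsInput back for the simpler boolean
predicate: without the AssertZext tag, a (trunc (setcc)) can only be peeled
off when known-bits analysis proves the discarded bits are zero. The
restored helper and its call-site idiom, assembled from the hunks above:

    // Restored predicate: the truncate is redundant only if the bits being
    // dropped are provably zero.
    static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
      if (V.getOpcode() != ISD::TRUNCATE)
        return false;
      SDValue VOp0 = V.getOperand(0);
      unsigned InBits = VOp0.getValueSizeInBits();
      unsigned Bits = V.getValueSizeInBits();
      return DAG.MaskedValueIsZero(VOp0,
                                   APInt::getHighBitsSet(InBits, InBits - Bits));
    }

    // LowerSELECT and LowerBRCOND then look through the truncate themselves:
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);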

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Wed Jul  6 04:11:49 2016
@@ -2083,65 +2083,51 @@ let Predicates = [HasBWI] in {
             (KMOVQkm addr:$src)>;
 }
 
-def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{
-  return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
-}]>;
-
 let Predicates = [HasAVX512] in {
   def : Pat<(i1 (trunc (i64 GR64:$src))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)),
-                                    sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+                                        (i32 1))), VK1)>;
 
   def : Pat<(i1 (trunc (i32 GR32:$src))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)),
-                                    sub_16bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
-            (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
 
   def : Pat<(i1 (trunc (i8 GR8:$src))),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri8 $src, (i8 1)),
-                                    sub_8bit)), VK1)>;
-
-  def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))),
-            (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
-
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
+       VK1)>;
   def : Pat<(i1 (trunc (i16 GR16:$src))),
-            (COPY_TO_REGCLASS (i16 (AND16ri8 $src, (i16 1))), VK1)>;
-
-  def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))),
-            (COPY_TO_REGCLASS $src, VK1)>;
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
+       VK1)>;
 
   def : Pat<(i32 (zext VK1:$src)),
-            (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
-
+            (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
   def : Pat<(i32 (anyext VK1:$src)),
-            (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
+            (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
 
   def : Pat<(i8 (zext VK1:$src)),
-            (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>;
-
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk
+                        (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
   def : Pat<(i8 (anyext VK1:$src)),
-            (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>;
+              (EXTRACT_SUBREG
+                (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
 
   def : Pat<(i64 (zext VK1:$src)),
-            (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
-
+            (AND64ri8 (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
   def : Pat<(i64 (anyext VK1:$src)),
-            (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
-                  sub_16bit))>;
+            (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit)>;
 
   def : Pat<(i16 (zext VK1:$src)),
-            (COPY_TO_REGCLASS $src, GR16)>;
-
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+              sub_16bit)>;
   def : Pat<(i16 (anyext VK1:$src)),
-            (i16 (COPY_TO_REGCLASS $src, GR16))>;
+            (EXTRACT_SUBREG
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+              sub_16bit)>;
 }
 def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
           (COPY_TO_REGCLASS VK1:$src, VK16)>;
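
On the pattern side, the deleted assertzext_i1 PatFrag let TableGen trust
that a GPR feeding a VK1 copy was already zero-extended from i1; with it
gone, the restored patterns re-mask through KMOVW plus an explicit AND
against 1. A hypothetical standalone C++ restatement of what that PatFrag
matched (the opcode check mirrors the (assertzext node:$src) fragment, the
VT check its predicate body):

    // True for (assertzext x) nodes asserting that only the low i1 bit of
    // x can be set.
    static bool isAssertZextI1(const SDNode *N) {
      if (N->getOpcode() != ISD::AssertZext)
        return false;
      return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
    }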

Modified: llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cmp.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cmp.ll Wed Jul  6 04:11:49 2016
@@ -163,10 +163,12 @@ define i32 @test10(i64 %b, i64 %c, i1 %d
 ; ALL-NEXT:    kmovw %edx, %k0
 ; ALL-NEXT:    cmpq %rsi, %rdi
 ; ALL-NEXT:    sete %al
+; ALL-NEXT:    andl $1, %eax
 ; ALL-NEXT:    kmovw %eax, %k1
 ; ALL-NEXT:    korw %k1, %k0, %k1
 ; ALL-NEXT:    kxorw %k1, %k0, %k0
 ; ALL-NEXT:    kmovw %k0, %eax
+; ALL-NEXT:    andl $1, %eax
 ; ALL-NEXT:    testb %al, %al
 ; ALL-NEXT:    je LBB8_1
 ; ALL-NEXT:  ## BB#2: ## %if.end.i

Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Wed Jul  6 04:11:49 2016
@@ -1513,264 +1513,265 @@ define <64 x i16> @test21(<64 x i16> %x
 ; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edi
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
-; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k2
+; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k1
 ; KNL-NEXT:    kshiftlw $0, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r15d, %xmm4
-; KNL-NEXT:    kmovw %k0, %r15d
-; KNL-NEXT:    kshiftlw $14, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    kshiftlw $15, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm4, %xmm4
+; KNL-NEXT:    vmovd %eax, %xmm4
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    kshiftlw $13, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r12d
-; KNL-NEXT:    kshiftlw $12, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    kshiftlw $11, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
-; KNL-NEXT:    kmovw %k0, %r13d
-; KNL-NEXT:    kshiftlw $10, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %esi
-; KNL-NEXT:    movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; KNL-NEXT:    kshiftlw $9, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %esi
-; KNL-NEXT:    kshiftlw $8, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %edi
-; KNL-NEXT:    kshiftlw $7, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r8d
-; KNL-NEXT:    kshiftlw $6, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %r10d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r9d
-; KNL-NEXT:    kshiftlw $5, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %r11d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r10d
-; KNL-NEXT:    kshiftlw $4, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $12, %ebx, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ebx
-; KNL-NEXT:    kshiftlw $3, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %ebp, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %ebp
-; KNL-NEXT:    kshiftlw $2, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r11d
-; KNL-NEXT:    kshiftlw $1, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm4, %xmm4
-; KNL-NEXT:    kmovw %k0, %r14d
-; KNL-NEXT:    vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT:    kshiftlw $0, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %eax, %xmm5
-; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $1, %edx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    vpinsrb $3, %edi, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edi
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $4, %esi, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT:    vpinsrb $5, %r13d, %xmm4, %xmm4
 ; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %esi
-; KNL-NEXT:    movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT:    vpinsrb $6, %r8d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    vpinsrb $7, %r10d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    vpinsrb $8, %r11d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r11d
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    vpinsrb $9, %ebx, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ebx
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %r10d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    vpinsrb $10, %ebp, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %ebp
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    vpinsrb $11, %r14d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r14d
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $12, %ebp, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    vpinsrb $12, %r15d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %r11d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $13, %r9d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    vpinsrb $14, %r12d, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r12d
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm5, %xmm5
-; KNL-NEXT:    kmovw %k0, %r14d
-; KNL-NEXT:    vptestmd %zmm7, %zmm7, %k0
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    vptestmd %zmm6, %zmm6, %k0
 ; KNL-NEXT:    kshiftlw $0, %k1, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %eax, %xmm6
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    vmovd %ecx, %xmm5
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $2, %r12d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    vpinsrb $2, %edi, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vpinsrb $3, %esi, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $4, %r13d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    vpinsrb $4, %r13d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $5, %r8d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $6, %esi, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    vpinsrb $6, %r10d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $7, %edi, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %edi
+; KNL-NEXT:    vpinsrb $7, %r11d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $8, %r8d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    vpinsrb $8, %ebx, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $9, %r9d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    vpinsrb $9, %ebp, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $10, %ebx, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    vpinsrb $10, %r14d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $11, %ebp, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    vpinsrb $11, %r15d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $12, %r10d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $13, %r11d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    vpinsrb $13, %r12d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $14, %r14d, %xmm6, %xmm6
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    vpinsrb $14, %r9d, %xmm5, %xmm5
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $15, %r15d, %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $15, %edx, %xmm5, %xmm5
 ; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    vptestmd %zmm7, %zmm7, %k1
 ; KNL-NEXT:    kshiftlw $0, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r12d, %xmm7
+; KNL-NEXT:    vmovd %eax, %xmm6
+; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    kshiftlw $14, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $15, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $2, %edi, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kshiftlw $13, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $12, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $4, %r8d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    kshiftlw $11, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $5, %r13d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    kshiftlw $10, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT:    kmovw %k0, %edi
+; KNL-NEXT:    kshiftlw $9, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $7, %ebx, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    kshiftlw $8, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $8, %ebp, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    kshiftlw $7, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $9, %r10d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    kshiftlw $6, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $10, %r11d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    kshiftlw $5, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $11, %esi, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    kshiftlw $4, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $12, %r14d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    kshiftlw $3, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $13, %r9d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    kshiftlw $2, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $14, %r15d, %xmm6, %xmm6
+; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    kshiftlw $1, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $15, %r12d, %xmm6, %xmm6
 ; KNL-NEXT:    kmovw %k0, %r12d
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $2, %edx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $3, %r13d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $5, %esi, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $6, %edi, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $7, %r8d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $8, %r9d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $9, %ebx, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $11, %r10d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $12, %r11d, %xmm7, %xmm7
-; KNL-NEXT:    vpinsrb $13, %r14d, %xmm7, %xmm7
+; KNL-NEXT:    kshiftlw $0, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vmovd %edx, %xmm7
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $3, %r8d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $4, %r13d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $5, %edi, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $6, %ebx, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $7, %ebp, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $8, %r10d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $9, %r11d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $10, %esi, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $11, %r14d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $12, %r9d, %xmm7, %xmm7
+; KNL-NEXT:    vpinsrb $13, %r15d, %xmm7, %xmm7
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4
@@ -1783,8 +1784,8 @@ define <64 x i16> @test21(<64 x i16> %x
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT:    vpinsrb $14, %r15d, %xmm7, %xmm4
-; KNL-NEXT:    vpinsrb $15, %r12d, %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $14, %r12d, %xmm7, %xmm4
+; KNL-NEXT:    vpinsrb $15, %edx, %xmm4, %xmm4
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; KNL-NEXT:    vpsllw $15, %ymm4, %ymm4
 ; KNL-NEXT:    vpsraw $15, %ymm4, %ymm4

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Wed Jul  6 04:11:49 2016
@@ -159,6 +159,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
 ;CHECK-LABEL: test13
 ;CHECK: cmpl    %esi, %edi
 ;CHECK: setb    %al
+;CHECK: andl    $1, %eax
 ;CHECK: kmovw   %eax, %k0
 ;CHECK: movw    $-4
 ;CHECK: korw

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Wed Jul  6 04:11:49 2016
@@ -9,7 +9,9 @@ define i32 @test_kortestz(i16 %a0, i16 %
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    kortestw %k0, %k1
 ; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
   ret i32 %res
@@ -5089,6 +5091,7 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
 
   %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
@@ -5109,6 +5112,7 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kandw %k2, %k1, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
 
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -5131,6 +5135,7 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
 
   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
@@ -5148,8 +5153,9 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k2 {%k1}
 ; CHECK-NEXT:    kmovw %k2, %ecx
 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    kmovw %k0, %edx
+; CHECK-NEXT:    kmovw %k1, %edx
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    andb %dl, %al
 ; CHECK-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Wed Jul  6 04:11:49 2016
@@ -173,35 +173,18 @@ define i32 @zext_test1(<16 x i32> %a, <1
 ; CHECK-NEXT:    kshiftlw $10, %k0, %k0
 ; CHECK-NEXT:    kshiftrw $15, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i32
   ret i32 %res
-}
-
-define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test2:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kshiftlw $10, %k0, %k0
-; CHECK-NEXT:    kshiftrw $15, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+}define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i16
   ret i16 %res
-}
-
-define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test3:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kshiftlw $10, %k0, %k0
-; CHECK-NEXT:    kshiftrw $15, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+}define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i8
@@ -596,6 +579,7 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
+; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovw %eax, %k1
 ; SKX-NEXT:    kshiftlq $5, %k1, %k1
 ; SKX-NEXT:    korq %k1, %k0, %k0
@@ -1639,10 +1623,10 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %r11d
@@ -1669,22 +1653,22 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %r9d, %xmm3
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    vmovd %r10d, %xmm3
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k2
 ; KNL-NEXT:    kshiftlw $0, %k0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $2, %r9d, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $4, %r14d, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $5, %r15d, %xmm2, %xmm2
@@ -1693,10 +1677,10 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vpinsrb $8, %ebx, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $9, %ebp, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $12, %edx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $11, %edx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
 ; KNL-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $14, %r9d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $14, %r10d, %xmm2, %xmm2
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
@@ -1729,7 +1713,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
@@ -1744,7 +1728,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vmovd %r10d, %xmm2
@@ -1759,12 +1743,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
@@ -1798,7 +1782,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
@@ -1813,7 +1797,7 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    vmovd %r10d, %xmm1
@@ -1828,12 +1812,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
 ; KNL-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0

Modified: llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll Wed Jul  6 04:11:49 2016
@@ -490,6 +490,7 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vfpclasssd $2, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    je LBB28_2
 ; CHECK-NEXT:  ## BB#1:
@@ -497,6 +498,7 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:  LBB28_2:
 ; CHECK-NEXT:    vfpclasssd $4, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    je LBB28_4
 ; CHECK-NEXT:  ## BB#3:
@@ -519,6 +521,7 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vfpclassss $4, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    je LBB29_2
 ; CHECK-NEXT:  ## BB#1:
@@ -526,6 +529,7 @@ define i8 @test_int_x86_avx512_mask_fpcl
 ; CHECK-NEXT:  LBB29_2:
 ; CHECK-NEXT:    vfpclassss $4, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    je LBB29_4
 ; CHECK-NEXT:  ## BB#3:

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Wed Jul  6 04:11:49 2016
@@ -1367,9 +1367,12 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; KNL_64-LABEL: test30:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    andl $1, %edx
+; KNL_64-NEXT:    kmovw %edx, %k1
 ; KNL_64-NEXT:    andl $1, %esi
+; KNL_64-NEXT:    kmovw %esi, %k2
 ; KNL_64-NEXT:    movl %edi, %eax
 ; KNL_64-NEXT:    andl $1, %eax
+; KNL_64-NEXT:    kmovw %eax, %k0
 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
 ; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
 ; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
@@ -1377,76 +1380,81 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; KNL_64-NEXT:    testb $1, %dil
 ; KNL_64-NEXT:    je .LBB29_2
 ; KNL_64-NEXT:  # BB#1: # %cond.load
-; KNL_64-NEXT:    vmovq %xmm1, %rcx
-; KNL_64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_64-NEXT:    vmovq %xmm1, %rax
+; KNL_64-NEXT:    vmovd (%rax), %xmm0
 ; KNL_64-NEXT:  .LBB29_2: # %else
-; KNL_64-NEXT:    testb %sil, %sil
+; KNL_64-NEXT:    kmovw %k2, %eax
+; KNL_64-NEXT:    movl %eax, %ecx
+; KNL_64-NEXT:    andl $1, %ecx
+; KNL_64-NEXT:    testb %cl, %cl
 ; KNL_64-NEXT:    je .LBB29_4
 ; KNL_64-NEXT:  # BB#3: # %cond.load1
 ; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
 ; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
 ; KNL_64-NEXT:  .LBB29_4: # %else2
+; KNL_64-NEXT:    kmovw %k1, %ecx
+; KNL_64-NEXT:    movl %ecx, %edx
+; KNL_64-NEXT:    andl $1, %edx
 ; KNL_64-NEXT:    testb %dl, %dl
 ; KNL_64-NEXT:    je .LBB29_6
 ; KNL_64-NEXT:  # BB#5: # %cond.load4
 ; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL_64-NEXT:    vmovq %xmm1, %rcx
-; KNL_64-NEXT:    vpinsrd $2, (%rcx), %xmm0, %xmm0
+; KNL_64-NEXT:    vmovq %xmm1, %rdx
+; KNL_64-NEXT:    vpinsrd $2, (%rdx), %xmm0, %xmm0
 ; KNL_64-NEXT:  .LBB29_6: # %else5
-; KNL_64-NEXT:    vmovd %eax, %xmm1
-; KNL_64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
-; KNL_64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; KNL_64-NEXT:    kmovw %k0, %edx
+; KNL_64-NEXT:    vmovd %edx, %xmm1
+; KNL_64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_64-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
 ; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test30:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    pushl %ebx
-; KNL_32-NEXT:  .Ltmp0:
-; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:    pushl %esi
-; KNL_32-NEXT:  .Ltmp1:
-; KNL_32-NEXT:    .cfi_def_cfa_offset 12
-; KNL_32-NEXT:  .Ltmp2:
-; KNL_32-NEXT:    .cfi_offset %esi, -12
-; KNL_32-NEXT:  .Ltmp3:
-; KNL_32-NEXT:    .cfi_offset %ebx, -8
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    andl $1, %eax
-; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    andl $1, %eax
+; KNL_32-NEXT:    kmovw %eax, %k2
+; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    movl %eax, %ecx
 ; KNL_32-NEXT:    andl $1, %ecx
-; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; KNL_32-NEXT:    movl %ebx, %edx
-; KNL_32-NEXT:    andl $1, %edx
+; KNL_32-NEXT:    kmovw %ecx, %k0
 ; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; KNL_32-NEXT:    # implicit-def: %XMM0
-; KNL_32-NEXT:    testb $1, %bl
+; KNL_32-NEXT:    testb $1, %al
 ; KNL_32-NEXT:    je .LBB29_2
 ; KNL_32-NEXT:  # BB#1: # %cond.load
-; KNL_32-NEXT:    vmovd %xmm1, %esi
-; KNL_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT:    vmovd %xmm1, %eax
+; KNL_32-NEXT:    vmovd (%eax), %xmm0
 ; KNL_32-NEXT:  .LBB29_2: # %else
+; KNL_32-NEXT:    kmovw %k2, %eax
+; KNL_32-NEXT:    movl %eax, %ecx
+; KNL_32-NEXT:    andl $1, %ecx
 ; KNL_32-NEXT:    testb %cl, %cl
 ; KNL_32-NEXT:    je .LBB29_4
 ; KNL_32-NEXT:  # BB#3: # %cond.load1
-; KNL_32-NEXT:    vpextrd $1, %xmm1, %esi
-; KNL_32-NEXT:    vpinsrd $1, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB29_4: # %else2
-; KNL_32-NEXT:    testb %al, %al
+; KNL_32-NEXT:    kmovw %k1, %ecx
+; KNL_32-NEXT:    movl %ecx, %edx
+; KNL_32-NEXT:    andl $1, %edx
+; KNL_32-NEXT:    testb %dl, %dl
 ; KNL_32-NEXT:    je .LBB29_6
 ; KNL_32-NEXT:  # BB#5: # %cond.load4
-; KNL_32-NEXT:    vpextrd $2, %xmm1, %esi
-; KNL_32-NEXT:    vpinsrd $2, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT:    vpextrd $2, %xmm1, %edx
+; KNL_32-NEXT:    vpinsrd $2, (%edx), %xmm0, %xmm0
 ; KNL_32-NEXT:  .LBB29_6: # %else5
+; KNL_32-NEXT:    kmovw %k0, %edx
 ; KNL_32-NEXT:    vmovd %edx, %xmm1
-; KNL_32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; KNL_32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; KNL_32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
 ; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
-; KNL_32-NEXT:    popl %esi
-; KNL_32-NEXT:    popl %ebx
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test30:
@@ -1463,7 +1471,7 @@ define <3 x i32> @test30(<3 x i32*> %bas
 ; SKX-NEXT:    je .LBB29_2
 ; SKX-NEXT:  # BB#1: # %cond.load
 ; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    vmovd (%rax), %xmm0
 ; SKX-NEXT:  .LBB29_2: # %else
 ; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -1637,12 +1645,12 @@ define <16 x i64> @test_gather_16i64(<16
 ; KNL_32-LABEL: test_gather_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp4:
+; KNL_32-NEXT:  .Ltmp0:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp5:
+; KNL_32-NEXT:  .Ltmp1:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp6:
+; KNL_32-NEXT:  .Ltmp2:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1760,12 +1768,12 @@ define <16 x double> @test_gather_16f64(
 ; KNL_32-LABEL: test_gather_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp7:
+; KNL_32-NEXT:  .Ltmp3:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp8:
+; KNL_32-NEXT:  .Ltmp4:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp9:
+; KNL_32-NEXT:  .Ltmp5:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1877,12 +1885,12 @@ define void @test_scatter_16i64(<16 x i6
 ; KNL_32-LABEL: test_scatter_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp10:
+; KNL_32-NEXT:  .Ltmp6:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp11:
+; KNL_32-NEXT:  .Ltmp7:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp12:
+; KNL_32-NEXT:  .Ltmp8:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1991,12 +1999,12 @@ define void @test_scatter_16f64(<16 x do
 ; KNL_32-LABEL: test_scatter_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Ltmp13:
+; KNL_32-NEXT:  .Ltmp9:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Ltmp14:
+; KNL_32-NEXT:  .Ltmp10:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Ltmp15:
+; KNL_32-NEXT:  .Ltmp11:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp

Modified: llvm/trunk/test/CodeGen/X86/pr27591.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr27591.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr27591.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr27591.ll Wed Jul  6 04:11:49 2016
@@ -3,48 +3,39 @@ target datalayout = "e-m:e-i64:64-f80:12
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @test1(i32 %x) #0 {
-; CHECK-LABEL: test1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    kmovw %ecx, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    movzbl %al, %edi
-; CHECK-NEXT:    callq callee1
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    retq
 entry:
   %tobool = icmp ne i32 %x, 0
   call void @callee1(i1 zeroext %tobool)
   ret void
 }
 
+; CHECK-LABEL: test1:
+; CHECK:      cmpl   $0, %edi
+; CHECK-NEXT: setne  %al
+; CHECK-NEXT: andb   $1, %al
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq  callee1
+
 define void @test2(i32 %x) #0 {
-; CHECK-LABEL: test2:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    movb %al, %cl
-; CHECK-NEXT:    kmovw %ecx, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    movb %cl, %al
-; CHECK-NEXT:    xorl %edi, %edi
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    movl $-1, %edx
-; CHECK-NEXT:    cmovnel %edx, %edi
-; CHECK-NEXT:    callq callee2
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    retq
 entry:
   %tobool = icmp ne i32 %x, 0
   call void @callee2(i1 signext %tobool)
   ret void
 }
 
+; CHECK-LABEL: test2:
+; CHECK:      cmpl   $0, %edi
+; CHECK-NEXT: setne  %al
+; CHECK-NEXT: kmovb  %eax, %k0
+; CHECK-NEXT: kmovw  %k0, %edi
+; CHECK-NEXT: andl  $1, %edi
+; CHECK-NEXT: movb  %dil, %al
+; CHECK-NEXT: xorl  %edi, %edi
+; CHECK-NEXT: testb  %al, %al
+; CHECK-NEXT: movl  $-1, %ecx
+; CHECK-NEXT: cmovnel  %ecx, %edi
+; CHECK-NEXT: callq  callee2
+
 declare void @callee1(i1 zeroext)
 declare void @callee2(i1 signext)
 

Modified: llvm/trunk/test/CodeGen/X86/pr28173.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr28173.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr28173.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr28173.ll Wed Jul  6 04:11:49 2016
@@ -5,12 +5,12 @@ target triple = "x86_64-unknown-linux-gn
 ; Note that the kmovs should really *not* appear in the output, this is an
 ; artifact of the current poor lowering. This is tracked by PR28175.
 
+; CHECK-LABEL: @foo64
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orq  $-2, %rax
+; CHECK: ret
 define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo64:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    orq $-2, %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    retq
   br label %bb
 
 bb:
@@ -22,12 +22,12 @@ end:
   ret i64 %v
 }
 
+; CHECK-LABEL: @foo16
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orl $65534, %eax
+; CHECK: retq
 define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    orl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
   br label %bb
 
 bb:

Modified: llvm/trunk/test/CodeGen/X86/xaluo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xaluo.ll?rev=274615&r1=274614&r2=274615&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xaluo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xaluo.ll Wed Jul  6 04:11:49 2016
@@ -738,10 +738,10 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 ; KNL-LABEL: bug27873:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    andl $1, %esi
-; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movl $160, %ecx
 ; KNL-NEXT:    movq %rdi, %rax
 ; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    seto %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0



