[llvm] r372154 - [X86] Call SimplifyDemandedVectorElts on KSHIFTL/KSHIFTR nodes during DAG combine.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 17 11:02:52 PDT 2019


Author: ctopper
Date: Tue Sep 17 11:02:52 2019
New Revision: 372154

URL: http://llvm.org/viewvc/llvm-project?rev=372154&view=rev
Log:
[X86] Call SimplifyDemandedVectorElts on KSHIFTL/KSHIFTR nodes during DAG combine.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-ext.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/vec_smulo.ll
    llvm/trunk/test/CodeGen/X86/vec_umulo.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=372154&r1=372153&r2=372154&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Sep 17 11:02:52 2019
@@ -45107,6 +45107,20 @@ static SDValue combineExtInVec(SDNode *N
   return SDValue();
 }
 
+static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -45247,6 +45261,8 @@ SDValue X86TargetLowering::PerformDAGCom
   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
   case X86ISD::PMULDQ:
   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
+  case X86ISD::KSHIFTL:
+  case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
   }
 
   return SDValue();

Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=372154&r1=372153&r2=372154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Tue Sep 17 11:02:52 2019
@@ -1886,332 +1886,311 @@ define void @extload_v8i64(<8 x i8>* %a,
 define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: test21:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k2
-; KNL-NEXT:    kxorw %k0, %k2, %k0
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
 ; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k1, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k1
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k2, %k2
+; KNL-NEXT:    kshiftrw $2, %k2, %k3
+; KNL-NEXT:    kxorw %k1, %k3, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k1
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k1
-; KNL-NEXT:    kmovw %r8d, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k1
-; KNL-NEXT:    kmovw %r9d, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $2, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftrw $1, %k2, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kxorw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k2
+; KNL-NEXT:    kmovw %ecx, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k2
+; KNL-NEXT:    kmovw %r8d, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k2
+; KNL-NEXT:    kmovw %r9d, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $1, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftrw $1, %k3, %k4
-; KNL-NEXT:    kxorw %k0, %k4, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k3
+; KNL-NEXT:    kxorw %k0, %k3, %k3
+; KNL-NEXT:    kshiftrw $2, %k3, %k4
+; KNL-NEXT:    kxorw %k2, %k4, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
+; KNL-NEXT:    kxorw %k2, %k3, %k2
+; KNL-NEXT:    kshiftrw $3, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $4, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $11, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $5, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $10, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $6, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $9, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $7, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $8, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $8, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $7, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $9, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $6, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $10, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $5, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $11, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $4, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $12, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $3, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $13, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $2, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k3
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftrw $14, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $1, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $1, %k2, %k2
+; KNL-NEXT:    kshiftrw $1, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    korw %k3, %k0, %k3
+; KNL-NEXT:    korw %k3, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftrw $1, %k4, %k5
-; KNL-NEXT:    kxorw %k0, %k5, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k4
+; KNL-NEXT:    kxorw %k0, %k4, %k4
+; KNL-NEXT:    kshiftrw $2, %k4, %k5
+; KNL-NEXT:    kxorw %k3, %k5, %k3
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $13, %k3, %k3
+; KNL-NEXT:    kxorw %k3, %k4, %k3
+; KNL-NEXT:    kshiftrw $3, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $12, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $4, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $11, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $5, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $10, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $6, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $9, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $7, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $8, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $8, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $7, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $9, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $6, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $10, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $5, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $11, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $4, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $12, %k3, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
 ; KNL-NEXT:    kxorw %k5, %k4, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $3, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $13, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $2, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftrw $14, %k3, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $1, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftlw $1, %k3, %k3
+; KNL-NEXT:    kshiftrw $1, %k3, %k3
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    korw %k4, %k3, %k3
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k0, %k5, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k5
+; KNL-NEXT:    kxorw %k4, %k5, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $13, %k4, %k4
 ; KNL-NEXT:    kxorw %k4, %k0, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k4
@@ -2329,228 +2308,113 @@ define <64 x i16> @test21(<64 x i16> %x
 ;
 ; AVX512DQNOBW-LABEL: test21:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    kmovw %esi, %k0
-; AVX512DQNOBW-NEXT:    kmovw %edi, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k0, %k2, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512DQNOBW-NEXT:    kxorw %k0, %k1, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQNOBW-NEXT:    kmovw %edx, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQNOBW-NEXT:    kmovw %ecx, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQNOBW-NEXT:    kmovw %r8d, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQNOBW-NEXT:    kmovw %r9d, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k3, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kmovw %edx, %k0
+; AVX512DQNOBW-NEXT:    kmovw %edi, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k2, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
+; AVX512DQNOBW-NEXT:    kxorw %k1, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512DQNOBW-NEXT:    kxorw %k0, %k3, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQNOBW-NEXT:    kxorw %k0, %k2, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %ecx, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %r8d, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %r9d, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k1, %k3, %k3
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k3, %k4
 ; AVX512DQNOBW-NEXT:    kxorw %k2, %k4, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kxorw %k2, %k3, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k3
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
@@ -2645,18 +2509,12 @@ define <64 x i16> @test21(<64 x i16> %x
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k1, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k4, %k5
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k5, %k3
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k3, %k3
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k3, %k3
 ; AVX512DQNOBW-NEXT:    kxorw %k3, %k4, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
 ; AVX512DQNOBW-NEXT:    kshiftrw $3, %k3, %k4
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
@@ -2747,13 +2605,113 @@ define <64 x i16> @test21(<64 x i16> %x
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQNOBW-NEXT:    korw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    vpmovm2d %k3, %zmm4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k1, %k5, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k1, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k5, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    korw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    vpmovm2d %k1, %zmm4
 ; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX512DQNOBW-NEXT:    vpmovm2d %k2, %zmm4
+; AVX512DQNOBW-NEXT:    vpmovm2d %k3, %zmm4
 ; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm2, %ymm4, %ymm2
-; AVX512DQNOBW-NEXT:    vpmovm2d %k1, %zmm4
+; AVX512DQNOBW-NEXT:    vpmovm2d %k2, %zmm4
 ; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm3, %ymm4, %ymm3
 ; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm4

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=372154&r1=372153&r2=372154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Tue Sep 17 11:02:52 2019
@@ -2753,229 +2753,114 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    kmovw %esi, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k2
-; KNL-NEXT:    kxorw %k0, %k2, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k1, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k1
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k1
-; KNL-NEXT:    kmovw %r8d, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k1
-; KNL-NEXT:    kmovw %r9d, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $2, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftrw $1, %k2, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kmovw %ecx, %k0
+; KNL-NEXT:    kmovw %esi, %k2
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $14, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $2, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kxorw %k1, %k2, %k2
+; KNL-NEXT:    kshiftrw $2, %k2, %k3
+; KNL-NEXT:    kxorw %k0, %k3, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kxorw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k2
+; KNL-NEXT:    kmovw %r8d, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k2
+; KNL-NEXT:    kmovw %r9d, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k2
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $1, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k1
+; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftrw $1, %k3, %k4
+; KNL-NEXT:    kxorw %k1, %k3, %k3
+; KNL-NEXT:    kshiftrw $2, %k3, %k4
 ; KNL-NEXT:    kxorw %k2, %k4, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
 ; KNL-NEXT:    kxorw %k2, %k3, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
 ; KNL-NEXT:    kshiftrw $3, %k2, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
@@ -3070,18 +2955,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftrw $1, %k4, %k5
+; KNL-NEXT:    kxorw %k1, %k4, %k4
+; KNL-NEXT:    kshiftrw $2, %k4, %k5
 ; KNL-NEXT:    kxorw %k3, %k5, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $14, %k3, %k3
+; KNL-NEXT:    kshiftrw $13, %k3, %k3
 ; KNL-NEXT:    kxorw %k3, %k4, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $13, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
 ; KNL-NEXT:    kshiftrw $3, %k3, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
@@ -3172,9 +3051,109 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k3, %k3
-; KNL-NEXT:    kmovw %k3, 6(%rdi)
-; KNL-NEXT:    kmovw %k2, 4(%rdi)
-; KNL-NEXT:    kmovw %k1, 2(%rdi)
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k1, %k5, %k1
+; KNL-NEXT:    kshiftrw $2, %k1, %k5
+; KNL-NEXT:    kxorw %k4, %k5, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $13, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $12, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $11, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $10, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $9, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $8, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $7, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $6, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $5, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $4, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $3, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $2, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $1, %k4, %k4
+; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    korw %k4, %k1, %k1
+; KNL-NEXT:    kmovw %k1, 6(%rdi)
+; KNL-NEXT:    kmovw %k3, 4(%rdi)
+; KNL-NEXT:    kmovw %k2, 2(%rdi)
 ; KNL-NEXT:    kmovw %k0, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -3196,229 +3175,114 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; AVX512DQ-LABEL: store_64i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edx, %k0
-; AVX512DQ-NEXT:    kmovw %esi, %k1
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    kxorw %k0, %k2, %k0
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512DQ-NEXT:    kxorw %k0, %k1, %k0
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %r8d, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %r9d, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512DQ-NEXT:    kxorw %k1, %k3, %k1
-; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k2, %k1
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
+; AVX512DQ-NEXT:    kxorw %k1, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512DQ-NEXT:    kxorw %k0, %k3, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQ-NEXT:    kxorw %k0, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %r8d, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %r9d, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k1, %k1
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQ-NEXT:    kxorw %k1, %k3, %k3
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
 ; AVX512DQ-NEXT:    kxorw %k2, %k4, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k2
 ; AVX512DQ-NEXT:    kxorw %k2, %k3, %k2
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k3
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
@@ -3513,18 +3377,12 @@ define void @store_64i1(<64 x i1>* %a, <
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512DQ-NEXT:    kxorw %k1, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
 ; AVX512DQ-NEXT:    kxorw %k3, %k5, %k3
 ; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k3
+; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k3
 ; AVX512DQ-NEXT:    kxorw %k3, %k4, %k3
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
 ; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k5
@@ -3615,9 +3473,109 @@ define void @store_64i1(<64 x i1>* %a, <
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
 ; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQ-NEXT:    korw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kmovw %k3, 6(%rdi)
-; AVX512DQ-NEXT:    kmovw %k2, 4(%rdi)
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k1, %k5, %k1
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k5
+; AVX512DQ-NEXT:    kxorw %k4, %k5, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    korw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, 6(%rdi)
+; AVX512DQ-NEXT:    kmovw %k3, 4(%rdi)
+; AVX512DQ-NEXT:    kmovw %k2, 2(%rdi)
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
 ; AVX512DQ-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/vec_smulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_smulo.ll?rev=372154&r1=372153&r2=372154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_smulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_smulo.ll Tue Sep 17 11:02:52 2019
@@ -1730,24 +1730,20 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
 ;
 ; AVX512-LABEL: smulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vmovq %xmm1, %rdx
-; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
 ; AVX512-NEXT:    imulq %rdx, %rsi
-; AVX512-NEXT:    seto %dl
+; AVX512-NEXT:    vmovq %rsi, %xmm0
 ; AVX512-NEXT:    imulq %rax, %rcx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    vmovq %rsi, %xmm1
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kmovd %edx, %k1
-; AVX512-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512-NEXT:    kxorw %k0, %k2, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kxorw %k0, %k1, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -2201,73 +2197,46 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ;
 ; AVX512-LABEL: smulo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
 ; AVX512-NEXT:    kmovd %k1, %r9d
 ; AVX512-NEXT:    andb $1, %r9b
 ; AVX512-NEXT:    negb %r9b
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kshiftrw $3, %k1, %k2
 ; AVX512-NEXT:    kmovd %k2, %r10d
 ; AVX512-NEXT:    andb $1, %r10b
 ; AVX512-NEXT:    negb %r10b
 ; AVX512-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %r11d
-; AVX512-NEXT:    andb $1, %r11b
-; AVX512-NEXT:    negb %r11b
-; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %ebx
-; AVX512-NEXT:    andb $1, %bl
-; AVX512-NEXT:    negb %bl
-; AVX512-NEXT:    kshiftrw $1, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %esi
+; AVX512-NEXT:    kmovd %k1, %ecx
+; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    negb %cl
+; AVX512-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512-NEXT:    kmovd %k0, %esi
 ; AVX512-NEXT:    andb $1, %sil
 ; AVX512-NEXT:    negb %sil
-; AVX512-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %edx
-; AVX512-NEXT:    andb $1, %dl
-; AVX512-NEXT:    negb %dl
 ; AVX512-NEXT:    kmovd %k1, %eax
 ; AVX512-NEXT:    andb $1, %al
 ; AVX512-NEXT:    negb %al
-; AVX512-NEXT:    kmovd %k0, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
+; AVX512-NEXT:    kmovd %k2, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    negb %dl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    imulb %cl
+; AVX512-NEXT:    imulb %dl
 ; AVX512-NEXT:    movl %eax, %r8d
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    movl %r8d, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
-; AVX512-NEXT:    cmpb %r8b, %cl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
-; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512-NEXT:    movl %edx, %eax
-; AVX512-NEXT:    imulb %sil
-; AVX512-NEXT:    movl %eax, %edx
-; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    movl %edx, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
-; AVX512-NEXT:    cmpb %dl, %cl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    movl %r8d, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    negb %dl
+; AVX512-NEXT:    cmpb %r8b, %dl
+; AVX512-NEXT:    setne %dl
+; AVX512-NEXT:    orb %al, %dl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512-NEXT:    movl %r11d, %eax
-; AVX512-NEXT:    imulb %bl
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    movl %esi, %eax
+; AVX512-NEXT:    imulb %cl
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    movl %esi, %ecx
@@ -2278,38 +2247,37 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kxorw %k0, %k2, %k2
+; AVX512-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512-NEXT:    kxorw %k1, %k3, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512-NEXT:    movl %r10d, %eax
-; AVX512-NEXT:    imulb %r9b
+; AVX512-NEXT:    kxorw %k1, %k2, %k1
+; AVX512-NEXT:    kshiftlw $13, %k1, %k1
+; AVX512-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512-NEXT:    movl %r9d, %eax
+; AVX512-NEXT:    imulb %r10b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
 ; AVX512-NEXT:    seto %cl
-; AVX512-NEXT:    movl %eax, %ebx
-; AVX512-NEXT:    andb $1, %bl
-; AVX512-NEXT:    negb %bl
-; AVX512-NEXT:    cmpb %al, %bl
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    orb %cl, %bl
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    negb %dl
+; AVX512-NEXT:    cmpb %al, %dl
+; AVX512-NEXT:    setne %dl
+; AVX512-NEXT:    orb %cl, %dl
 ; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    kmovd %ecx, %k1
-; AVX512-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
+; AVX512-NEXT:    kmovd %ecx, %k2
+; AVX512-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kmovd %r8d, %k0
-; AVX512-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512-NEXT:    kmovd %edx, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512-NEXT:    kmovd %r8d, %k1
 ; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kxorw %k0, %k2, %k0
+; AVX512-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512-NEXT:    kxorw %k1, %k2, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k0
@@ -2321,7 +2289,6 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0

Modified: llvm/trunk/test/CodeGen/X86/vec_umulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_umulo.ll?rev=372154&r1=372153&r2=372154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_umulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_umulo.ll Tue Sep 17 11:02:52 2019
@@ -1532,26 +1532,21 @@ define <2 x i32> @umulo_v2i64(<2 x i64>
 ;
 ; AVX512-LABEL: umulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
 ; AVX512-NEXT:    mulq %rdx
-; AVX512-NEXT:    movq %rax, %rsi
-; AVX512-NEXT:    seto %r9b
-; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    mulq %r8
 ; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vmovq %rsi, %xmm1
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    mulq %rsi
+; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kmovd %r9d, %k1
-; AVX512-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512-NEXT:    kxorw %k0, %k2, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kxorw %k0, %k1, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -1950,7 +1945,6 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ;
 ; AVX512-LABEL: umulo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbx
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
@@ -1962,47 +1956,26 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    kmovd %k2, %r10d
 ; AVX512-NEXT:    andb $1, %r10b
 ; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %r11d
-; AVX512-NEXT:    andb $1, %r11b
-; AVX512-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %ebx
-; AVX512-NEXT:    andb $1, %bl
-; AVX512-NEXT:    kshiftrw $1, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %edx
-; AVX512-NEXT:    andb $1, %dl
-; AVX512-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %esi
+; AVX512-NEXT:    kmovd %k0, %esi
 ; AVX512-NEXT:    andb $1, %sil
-; AVX512-NEXT:    kmovd %k0, %eax
-; AVX512-NEXT:    andb $1, %al
+; AVX512-NEXT:    kshiftrw $2, %k1, %k0
 ; AVX512-NEXT:    kmovd %k1, %ecx
 ; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    kmovd %k2, %eax
+; AVX512-NEXT:    andb $1, %al
+; AVX512-NEXT:    kmovd %k0, %edx
+; AVX512-NEXT:    andb $1, %dl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    mulb %cl
+; AVX512-NEXT:    mulb %dl
 ; AVX512-NEXT:    movl %eax, %r8d
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    testb $-2, %r8b
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
-; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512-NEXT:    movl %edx, %eax
-; AVX512-NEXT:    mulb %sil
-; AVX512-NEXT:    movl %eax, %edx
-; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    testb $-2, %dl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    setne %dl
+; AVX512-NEXT:    orb %al, %dl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512-NEXT:    movl %r11d, %eax
-; AVX512-NEXT:    mulb %bl
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    movl %esi, %eax
+; AVX512-NEXT:    mulb %cl
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    testb $-2, %sil
@@ -2010,35 +1983,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kxorw %k0, %k2, %k2
+; AVX512-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512-NEXT:    kxorw %k1, %k3, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    kxorw %k1, %k2, %k1
+; AVX512-NEXT:    kshiftlw $13, %k1, %k1
+; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    movl %r9d, %eax
 ; AVX512-NEXT:    mulb %r10b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
 ; AVX512-NEXT:    seto %cl
 ; AVX512-NEXT:    testb $-2, %al
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    orb %cl, %bl
+; AVX512-NEXT:    setne %dl
+; AVX512-NEXT:    orb %cl, %dl
 ; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    kmovd %ecx, %k1
-; AVX512-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
+; AVX512-NEXT:    kmovd %ecx, %k2
+; AVX512-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kmovd %r8d, %k0
-; AVX512-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512-NEXT:    kmovd %edx, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512-NEXT:    kmovd %r8d, %k1
 ; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kxorw %k0, %k2, %k0
+; AVX512-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512-NEXT:    kxorw %k1, %k2, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k0
@@ -2050,7 +2022,6 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0




More information about the llvm-commits mailing list