[llvm] r373495 - [X86] Rewrite the vXi1 subvector insertion code to not rely on the value of bits that might be undef

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 2 10:47:09 PDT 2019


Author: ctopper
Date: Wed Oct  2 10:47:09 2019
New Revision: 373495

URL: http://llvm.org/viewvc/llvm-project?rev=373495&view=rev
Log:
[X86] Rewrite the vXi1 subvector insertion code to not rely on the value of bits that might be undef

The previous code tried to do a trick where we would extract the subvector from the location we were inserting. Then xor that with the new value. Take the xored value and clear out the bits above the subvector size. Then shift that xored subvector to the insert location. And finally xor that with the original vector. Since the old subvector was used in both xors, this would leave just the new subvector at the inserted location. Since the surrounding bits had been zeroed no other bits of the original vector would be modified.

Unfortunately, if the old subvector came from undef we might aggressively propagate the undef. Then we end up with the XORs not cancelling because they aren't using the same value for the two uses of the old subvector. @bkramer gave me a case that demonstrated this, but we haven't reduced it enough to make it easily readable to see what's happening.

This patch uses a safer, but more costly approach. It isolates the bits above the insertion point and the bits below the insertion point and ORs those together, leaving 0s at the insertion location. Then it widens the subvector with 0s in the upper bits and shifts it into position with 0s in the lower bits. Finally, we OR the shifted subvector with the combined upper and lower bits.

Differential Revision: https://reviews.llvm.org/D68311

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
    llvm/trunk/test/CodeGen/X86/avx512-ext.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/masked_store.ll
    llvm/trunk/test/CodeGen/X86/vec_smulo.ll
    llvm/trunk/test/CodeGen/X86/vec_umulo.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Oct  2 10:47:09 2019
@@ -5769,23 +5769,35 @@ static SDValue insert1BitVector(SDValue
 
   // Widen the vector if needed.
   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
-  // Move the current value of the bit to be replace to the lsbs.
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
-                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
-  // Xor with the new bit.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
-  // Shift to MSB, filling bottom bits with 0.
+
+  // Clear the upper bits of the subvector and move it to its insert position.
   unsigned ShiftLeft = NumElems - SubVecNumElems;
-  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
-  // Shift to the final position, filling upper bits with 0.
+  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
-  // Xor with original vector leaving the new value.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+
+  // Isolate the bits below the insertion point.
+  unsigned LowShift = NumElems - IdxVal;
+  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
+  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+                    DAG.getTargetConstant(LowShift, dl, MVT::i8));
+
+  // Isolate the bits after the last inserted bit.
+  unsigned HighShift = IdxVal + SubVecNumElems;
+  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+                            DAG.getTargetConstant(HighShift, dl, MVT::i8));
+  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+                    DAG.getTargetConstant(HighShift, dl, MVT::i8));
+
+  // Now OR all 3 pieces together.
+  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
+  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
+
   // Reduce to original width if needed.
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
 }
 
 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,

Modified: llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll Wed Oct  2 10:47:09 2019
@@ -531,211 +531,256 @@ define <17 x i1> @test16(<17 x i1> %a, <
 ; KNL-NEXT:    pushq %r12
 ; KNL-NEXT:    pushq %rbx
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kmovw %esi, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k3
-; KNL-NEXT:    kxorw %k2, %k3, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $4, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kshiftlw $8, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $8, %k1, %k1
+; KNL-NEXT:    kshiftlw $9, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $9, %k1, %k1
+; KNL-NEXT:    kshiftlw $10, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kshiftlw $11, %k0, %k6
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kshiftlw $12, %k0, %k5
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kshiftlw $13, %k0, %k4
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kmovw %r8d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kmovw %r9d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $13, %k1, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
+; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %edx, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kmovw %esi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $2, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %r8d, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %r9d, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $9, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftlw $10, %k2, %k2
+; KNL-NEXT:    korw %k2, %k6, %k2
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
 ; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $11, %k2, %k2
+; KNL-NEXT:    korw %k2, %k5, %k2
 ; KNL-NEXT:    xorl %ecx, %ecx
 ; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movl $65535, %edx ## imm = 0xFFFF
 ; KNL-NEXT:    movl $0, %esi
 ; KNL-NEXT:    cmovnel %edx, %esi
-; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $12, %k2, %k2
+; KNL-NEXT:    korw %k2, %k4, %k2
 ; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $13, %k2, %k2
+; KNL-NEXT:    korw %k2, %k3, %k2
 ; KNL-NEXT:    cmovnel %edx, %ecx
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; KNL-NEXT:    kmovw %edx, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kandw %k1, %k2, %k1
 ; KNL-NEXT:    kmovw %k1, %r8d
@@ -832,193 +877,294 @@ define <17 x i1> @test16(<17 x i1> %a, <
 ; SKX-NEXT:    pushq %r13
 ; SKX-NEXT:    pushq %r12
 ; SKX-NEXT:    pushq %rbx
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    movq %rdi, %rax
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k0, %k0
-; SKX-NEXT:    kshiftrd $30, %k0, %k0
-; SKX-NEXT:    kxord %k0, %k2, %k2
-; SKX-NEXT:    kshiftrd $2, %k2, %k3
-; SKX-NEXT:    kxord %k1, %k3, %k1
-; SKX-NEXT:    kshiftld $31, %k1, %k1
-; SKX-NEXT:    kshiftrd $29, %k1, %k1
-; SKX-NEXT:    kxord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $31, %k0, %k1
+; SKX-NEXT:    kshiftld $2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $30, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
 ; SKX-NEXT:    kshiftrd $3, %k1, %k2
+; SKX-NEXT:    kshiftld $3, %k2, %k2
+; SKX-NEXT:    kshiftld $30, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $30, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
+; SKX-NEXT:    kshiftrd $29, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $4, %k1, %k2
+; SKX-NEXT:    kshiftld $4, %k2, %k2
+; SKX-NEXT:    kshiftld $29, %k1, %k1
+; SKX-NEXT:    kshiftrd $29, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
 ; SKX-NEXT:    kshiftrd $28, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $5, %k1, %k2
+; SKX-NEXT:    kshiftld $5, %k2, %k2
+; SKX-NEXT:    kshiftld $28, %k1, %k1
+; SKX-NEXT:    kshiftrd $28, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $4, %k1, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $27, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $5, %k1, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $6, %k1, %k2
+; SKX-NEXT:    kshiftld $6, %k2, %k2
+; SKX-NEXT:    kshiftld $27, %k1, %k1
+; SKX-NEXT:    kshiftrd $27, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $26, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $6, %k1, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $7, %k1, %k2
+; SKX-NEXT:    kshiftld $7, %k2, %k2
+; SKX-NEXT:    kshiftld $26, %k1, %k1
+; SKX-NEXT:    kshiftrd $26, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $25, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $8, %k1, %k2
+; SKX-NEXT:    kshiftld $8, %k2, %k2
+; SKX-NEXT:    kshiftld $25, %k1, %k1
+; SKX-NEXT:    kshiftrd $25, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $7, %k1, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $24, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $8, %k1, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $9, %k1, %k2
+; SKX-NEXT:    kshiftld $9, %k2, %k2
+; SKX-NEXT:    kshiftld $24, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $24, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
 ; SKX-NEXT:    kshiftrd $23, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $9, %k1, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $10, %k1, %k2
+; SKX-NEXT:    kshiftld $10, %k2, %k2
+; SKX-NEXT:    kshiftld $23, %k1, %k1
+; SKX-NEXT:    kshiftrd $23, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
 ; SKX-NEXT:    kshiftrd $22, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $11, %k1, %k2
+; SKX-NEXT:    kshiftld $11, %k2, %k2
+; SKX-NEXT:    kshiftld $22, %k1, %k1
+; SKX-NEXT:    kshiftrd $22, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $10, %k1, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $21, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $11, %k1, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $12, %k1, %k2
+; SKX-NEXT:    kshiftld $12, %k2, %k2
+; SKX-NEXT:    kshiftld $21, %k1, %k1
+; SKX-NEXT:    kshiftrd $21, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $20, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $12, %k1, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $13, %k1, %k2
+; SKX-NEXT:    kshiftld $13, %k2, %k2
+; SKX-NEXT:    kshiftld $20, %k1, %k1
+; SKX-NEXT:    kshiftrd $20, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $19, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $14, %k1, %k2
+; SKX-NEXT:    kshiftld $14, %k2, %k2
+; SKX-NEXT:    kshiftld $19, %k1, %k1
+; SKX-NEXT:    kshiftrd $19, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $13, %k1, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $18, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $14, %k1, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $15, %k1, %k2
+; SKX-NEXT:    kshiftld $15, %k2, %k2
+; SKX-NEXT:    kshiftld $18, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $18, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
 ; SKX-NEXT:    kshiftrd $17, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftrd $15, %k1, %k2
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $16, %k1, %k2
+; SKX-NEXT:    kshiftld $16, %k2, %k2
+; SKX-NEXT:    kshiftld $17, %k1, %k1
+; SKX-NEXT:    kshiftrd $17, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k3, %k2
 ; SKX-NEXT:    kshiftrd $16, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kshiftrd $17, %k1, %k2
+; SKX-NEXT:    kshiftld $17, %k2, %k2
+; SKX-NEXT:    kshiftld $16, %k1, %k1
+; SKX-NEXT:    kshiftrd $16, %k1, %k1
+; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $16, %k1, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $15, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k2, %k1
+; SKX-NEXT:    kmovd %esi, %k2
+; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $31, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kmovd %edx, %k2
+; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $30, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $3, %k0, %k2
+; SKX-NEXT:    kshiftld $3, %k2, %k2
+; SKX-NEXT:    kshiftld $30, %k0, %k0
+; SKX-NEXT:    kshiftrd $30, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
 ; SKX-NEXT:    kmovd %ecx, %k2
-; SKX-NEXT:    kmovd %esi, %k3
-; SKX-NEXT:    kxord %k0, %k3, %k0
-; SKX-NEXT:    kshiftrd $2, %k0, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $29, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
-; SKX-NEXT:    kshiftrd $3, %k0, %k2
-; SKX-NEXT:    kmovd %r8d, %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $4, %k0, %k2
+; SKX-NEXT:    kshiftld $4, %k2, %k2
+; SKX-NEXT:    kshiftld $29, %k0, %k0
+; SKX-NEXT:    kshiftrd $29, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovd %r8d, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $28, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
-; SKX-NEXT:    kshiftrd $4, %k0, %k2
-; SKX-NEXT:    kmovd %r9d, %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $5, %k0, %k2
+; SKX-NEXT:    kshiftld $5, %k2, %k2
+; SKX-NEXT:    kshiftld $28, %k0, %k0
+; SKX-NEXT:    kshiftrd $28, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovd %r9d, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $27, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $6, %k0, %k2
+; SKX-NEXT:    kshiftld $6, %k2, %k2
+; SKX-NEXT:    kshiftld $27, %k0, %k0
+; SKX-NEXT:    kshiftrd $27, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $5, %k0, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $26, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
-; SKX-NEXT:    kshiftrd $6, %k0, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $25, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $7, %k0, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kshiftld $7, %k2, %k2
+; SKX-NEXT:    kshiftld $26, %k0, %k0
+; SKX-NEXT:    kshiftrd $26, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $24, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $25, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $8, %k0, %k2
+; SKX-NEXT:    kshiftld $8, %k2, %k2
+; SKX-NEXT:    kshiftld $25, %k0, %k0
+; SKX-NEXT:    kshiftrd $25, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $8, %k0, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $23, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $24, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $9, %k0, %k2
+; SKX-NEXT:    kshiftld $9, %k2, %k2
+; SKX-NEXT:    kshiftld $24, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $22, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $24, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k3, %k2
+; SKX-NEXT:    kshiftrd $23, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $10, %k0, %k2
+; SKX-NEXT:    kshiftld $10, %k2, %k2
+; SKX-NEXT:    kshiftld $23, %k0, %k0
+; SKX-NEXT:    kshiftrd $23, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k3, %k2
+; SKX-NEXT:    kshiftrd $22, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $11, %k0, %k2
+; SKX-NEXT:    kshiftld $11, %k2, %k2
+; SKX-NEXT:    kshiftld $22, %k0, %k0
+; SKX-NEXT:    kshiftrd $22, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $21, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $12, %k0, %k2
+; SKX-NEXT:    kshiftld $12, %k2, %k2
+; SKX-NEXT:    kshiftld $21, %k0, %k0
+; SKX-NEXT:    kshiftrd $21, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $11, %k0, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $20, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
-; SKX-NEXT:    kshiftrd $12, %k0, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $19, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $13, %k0, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kshiftld $13, %k2, %k2
+; SKX-NEXT:    kshiftld $20, %k0, %k0
+; SKX-NEXT:    kshiftrd $20, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $18, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $19, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $14, %k0, %k2
+; SKX-NEXT:    kshiftld $14, %k2, %k2
+; SKX-NEXT:    kshiftld $19, %k0, %k0
+; SKX-NEXT:    kshiftrd $19, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftrd $14, %k0, %k3
-; SKX-NEXT:    kxord %k2, %k3, %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $17, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $18, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $15, %k0, %k2
+; SKX-NEXT:    kshiftld $15, %k2, %k2
+; SKX-NEXT:    kshiftld $18, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $16, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $18, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k3, %k2
+; SKX-NEXT:    kshiftrd $17, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kshiftrd $16, %k0, %k2
+; SKX-NEXT:    kshiftld $16, %k2, %k2
+; SKX-NEXT:    kshiftld $17, %k0, %k0
+; SKX-NEXT:    kshiftrd $17, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxord %k3, %k2, %k2
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k3, %k2
+; SKX-NEXT:    kshiftrd $16, %k2, %k2
+; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kshiftrd $17, %k0, %k2
+; SKX-NEXT:    kshiftld $17, %k2, %k2
+; SKX-NEXT:    kshiftld $16, %k0, %k0
+; SKX-NEXT:    kshiftrd $16, %k0, %k0
+; SKX-NEXT:    kord %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftld $31, %k2, %k2
 ; SKX-NEXT:    kshiftrd $15, %k2, %k2
-; SKX-NEXT:    kxord %k2, %k0, %k0
+; SKX-NEXT:    kord %k0, %k2, %k0
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kshiftrd $16, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r8d
@@ -1113,215 +1259,262 @@ define <17 x i1> @test16(<17 x i1> %a, <
 ; KNL_X32-NEXT:    pushl %ebx
 ; KNL_X32-NEXT:    pushl %edi
 ; KNL_X32-NEXT:    pushl %esi
+; KNL_X32-NEXT:    subl $20, %esp
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k1
-; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
-; KNL_X32-NEXT:    kxorw %k1, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k2, %k3
-; KNL_X32-NEXT:    kxorw %k0, %k3, %k0
 ; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $2, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $1, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $3, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $4, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $3, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $5, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $4, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $6, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $5, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $7, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $6, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $9, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $9, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $7, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $8, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $8, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $8, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $9, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $7, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $9, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k2
+; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $6, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $10, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k6
+; KNL_X32-NEXT:    korw %k1, %k6, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $5, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $5, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $11, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k5
+; KNL_X32-NEXT:    korw %k1, %k5, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $12, %k1, %k1
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k4
+; KNL_X32-NEXT:    korw %k1, %k4, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $3, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $13, %k1, %k0
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftlw $2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $8, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k1
+; KNL_X32-NEXT:    korw %k0, %k1, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $7, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $9, %k0, %k2
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, (%esp) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $6, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $5, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $4, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $3, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $2, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k1, %k3, %k1
-; KNL_X32-NEXT:    kshiftrw $2, %k1, %k3
-; KNL_X32-NEXT:    kxorw %k2, %k3, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $3, %k1, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $4, %k1, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $5, %k1, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $6, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $7, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $8, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $8, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $7, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $9, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $9, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $9, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $6, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $10, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $8, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $8, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $5, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $11, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $9, %k2, %k2
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    korw %k2, %k7, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $4, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $12, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k2, %k2
+; KNL_X32-NEXT:    korw %k2, %k6, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $3, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $13, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k2, %k2
+; KNL_X32-NEXT:    korw %k2, %k5, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $5, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $14, %k1, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k2, %k2
+; KNL_X32-NEXT:    korw %k2, %k4, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $1, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $1, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k2, %k2
+; KNL_X32-NEXT:    korw %k2, %k3, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    xorl %eax, %eax
 ; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl $65535, %ecx ## imm = 0xFFFF
 ; KNL_X32-NEXT:    movl $0, %edx
 ; KNL_X32-NEXT:    cmovnel %ecx, %edx
+; KNL_X32-NEXT:    kshiftlw $2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    cmovnel %ecx, %eax
-; KNL_X32-NEXT:    kandw %k0, %k1, %k0
-; KNL_X32-NEXT:    kmovw %edx, %k1
+; KNL_X32-NEXT:    kmovw (%esp), %k2 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kandw %k1, %k2, %k1
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1403,6 +1596,7 @@ define <17 x i1> @test16(<17 x i1> %a, <
 ; KNL_X32-NEXT:    orl %esi, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
 ; KNL_X32-NEXT:    movw %cx, (%eax)
+; KNL_X32-NEXT:    addl $20, %esp
 ; KNL_X32-NEXT:    popl %esi
 ; KNL_X32-NEXT:    popl %edi
 ; KNL_X32-NEXT:    popl %ebx
@@ -1416,356 +1610,550 @@ define <7 x i1> @test17(<7 x i1> %a, <7
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $10, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kxorw %k1, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $10, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k4
-; KNL-NEXT:    kxorw %k0, %k4, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k3, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $14, %k3, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftlw $3, %k3, %k3
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k3, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftlw $4, %k3, %k3
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k3, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftlw $5, %k3, %k3
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k3, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
+; KNL-NEXT:    kshiftlw $6, %k3, %k3
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $10, %k3, %k3
+; KNL-NEXT:    korw %k0, %k3, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k3
+; KNL-NEXT:    kshiftlw $7, %k3, %k3
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $9, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k4
-; KNL-NEXT:    kxorw %k1, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k5
-; KNL-NEXT:    kxorw %k0, %k5, %k0
+; KNL-NEXT:    korw %k0, %k3, %k3
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k4, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $14, %k4, %k4
+; KNL-NEXT:    korw %k0, %k4, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $3, %k4, %k4
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k4, %k4
+; KNL-NEXT:    korw %k0, %k4, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $4, %k4, %k4
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k4, %k4
+; KNL-NEXT:    korw %k0, %k4, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $5, %k4, %k4
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k4, %k4
+; KNL-NEXT:    korw %k0, %k4, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
+; KNL-NEXT:    kshiftlw $6, %k4, %k4
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $10, %k4, %k4
+; KNL-NEXT:    korw %k0, %k4, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k4
+; KNL-NEXT:    kshiftlw $7, %k4, %k4
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k5
-; KNL-NEXT:    kxorw %k1, %k5, %k5
-; KNL-NEXT:    kshiftrw $2, %k5, %k6
-; KNL-NEXT:    kxorw %k0, %k6, %k0
+; KNL-NEXT:    korw %k0, %k4, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k5, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $14, %k5, %k5
+; KNL-NEXT:    korw %k0, %k5, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k6
-; KNL-NEXT:    kxorw %k6, %k5, %k5
+; KNL-NEXT:    kshiftlw $3, %k5, %k5
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $12, %k5, %k5
-; KNL-NEXT:    kxorw %k5, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k5, %k5
+; KNL-NEXT:    korw %k0, %k5, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k6
-; KNL-NEXT:    kxorw %k6, %k5, %k5
+; KNL-NEXT:    kshiftlw $4, %k5, %k5
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $11, %k5, %k5
-; KNL-NEXT:    kxorw %k5, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k5, %k5
+; KNL-NEXT:    korw %k0, %k5, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k6
-; KNL-NEXT:    kxorw %k6, %k5, %k5
+; KNL-NEXT:    kshiftlw $5, %k5, %k5
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $10, %k5, %k5
-; KNL-NEXT:    kxorw %k5, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k5, %k5
+; KNL-NEXT:    korw %k0, %k5, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k6
-; KNL-NEXT:    kxorw %k6, %k5, %k5
+; KNL-NEXT:    kshiftlw $6, %k5, %k5
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $10, %k5, %k5
+; KNL-NEXT:    korw %k0, %k5, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k5
+; KNL-NEXT:    kshiftlw $7, %k5, %k5
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL-NEXT:    kshiftrw $9, %k5, %k5
-; KNL-NEXT:    kxorw %k5, %k0, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k6
-; KNL-NEXT:    kxorw %k1, %k6, %k6
-; KNL-NEXT:    kshiftrw $2, %k6, %k7
-; KNL-NEXT:    kxorw %k0, %k7, %k0
+; KNL-NEXT:    korw %k0, %k5, %k5
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k6, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k6
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k6, %k6
+; KNL-NEXT:    kshiftlw $3, %k6, %k6
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    kxorw %k6, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k6
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k6, %k6
+; KNL-NEXT:    kshiftlw $4, %k6, %k6
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    kxorw %k6, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kshiftrw $5, %k0, %k6
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k6, %k6
+; KNL-NEXT:    kshiftlw $5, %k6, %k6
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    kxorw %k6, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kshiftrw $6, %k0, %k6
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k6, %k6
+; KNL-NEXT:    kshiftlw $6, %k6, %k6
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k6
+; KNL-NEXT:    kshiftlw $7, %k6, %k6
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    kxorw %k6, %k0, %k6
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; KNL-NEXT:    korw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kshiftrw $2, %k0, %k2
-; KNL-NEXT:    kxorw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k7
+; KNL-NEXT:    kshiftlw $3, %k7, %k7
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k7
+; KNL-NEXT:    kshiftlw $4, %k7, %k7
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k7
+; KNL-NEXT:    kshiftlw $5, %k7, %k7
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k7
+; KNL-NEXT:    kshiftlw $6, %k7, %k7
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k7
+; KNL-NEXT:    kshiftlw $7, %k7, %k7
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k7
+; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %edx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
 ; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kxorw %k7, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k7
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftrw $2, %k0, %k3
-; KNL-NEXT:    kxorw %k2, %k3, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kmovw %r8d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %r8d, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kmovw %r9d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %r9d, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    korw %k1, %k2, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kxorw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kshiftlw $14, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftrw $2, %k1, %k3
-; KNL-NEXT:    kxorw %k2, %k3, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kshiftlw $13, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $3, %k1, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $4, %k1, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $5, %k1, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kshiftrw $6, %k1, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
@@ -1808,300 +2196,488 @@ define <7 x i1> @test17(<7 x i1> %a, <7
 ; SKX-LABEL: test17:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    movq %rdi, %rax
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kshiftlb $7, %k0, %k0
-; SKX-NEXT:    kshiftrb $6, %k0, %k0
-; SKX-NEXT:    kxorb %k0, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k3
-; SKX-NEXT:    kxorb %k1, %k3, %k1
-; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    kxorb %k1, %k2, %k1
+; SKX-NEXT:    kshiftrb $7, %k0, %k1
+; SKX-NEXT:    kshiftlb $2, %k0, %k0
+; SKX-NEXT:    korb %k0, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
 ; SKX-NEXT:    kshiftrb $3, %k1, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftlb $3, %k2, %k2
+; SKX-NEXT:    kshiftlb $6, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxorb %k2, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
 ; SKX-NEXT:    kshiftrb $4, %k1, %k2
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftlb $4, %k2, %k2
+; SKX-NEXT:    kshiftlb $5, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    kxorb %k2, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k2
+; SKX-NEXT:    kshiftlb $5, %k2, %k2
+; SKX-NEXT:    kshiftlb $4, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    kxorb %k2, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k3, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
 ; SKX-NEXT:    kshiftrb $6, %k1, %k2
+; SKX-NEXT:    kshiftlb $6, %k2, %k2
+; SKX-NEXT:    kshiftlb $3, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k3, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
+; SKX-NEXT:    kshiftrb $7, %k1, %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftlb $2, %k1, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
 ; SKX-NEXT:    kshiftrb $1, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kxorb %k2, %k1, %k1
+; SKX-NEXT:    korb %k1, %k2, %k1
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kxorb %k0, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k4
-; SKX-NEXT:    kxorb %k3, %k4, %k3
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $7, %k2, %k2
+; SKX-NEXT:    korb %k0, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kshiftrb $3, %k2, %k3
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $3, %k3, %k3
+; SKX-NEXT:    kshiftlb $6, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kshiftrb $4, %k2, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $4, %k3, %k3
+; SKX-NEXT:    kshiftlb $5, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kshiftrb $5, %k2, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $5, %k3, %k3
+; SKX-NEXT:    kshiftlb $4, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kshiftrb $6, %k2, %k3
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $6, %k3, %k3
+; SKX-NEXT:    kshiftlb $3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kshiftlb $7, %k4, %k3
+; SKX-NEXT:    kshiftrb $2, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
+; SKX-NEXT:    kshiftrb $7, %k2, %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
+; SKX-NEXT:    kshiftlb $2, %k2, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kshiftlb $7, %k4, %k3
 ; SKX-NEXT:    kshiftrb $1, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kandb %k1, %k2, %k1
-; SKX-NEXT:    kxorb %k0, %k4, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k4
-; SKX-NEXT:    kxorb %k3, %k4, %k3
+; SKX-NEXT:    kshiftlb $7, %k4, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kshiftrb $7, %k2, %k2
+; SKX-NEXT:    korb %k0, %k2, %k2
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kshiftrb $3, %k2, %k3
+; SKX-NEXT:    kshiftlb $3, %k3, %k3
+; SKX-NEXT:    kshiftlb $6, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kshiftlb $7, %k4, %k3
+; SKX-NEXT:    kshiftrb $5, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k3
+; SKX-NEXT:    kshiftlb $4, %k3, %k3
+; SKX-NEXT:    kshiftlb $5, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
 ; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k3, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k3
+; SKX-NEXT:    kshiftlb $5, %k3, %k3
+; SKX-NEXT:    kshiftlb $4, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftrb $4, %k2, %k4
-; SKX-NEXT:    kxorb %k3, %k4, %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
 ; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k3
+; SKX-NEXT:    kshiftlb $6, %k3, %k3
+; SKX-NEXT:    kshiftlb $3, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
 ; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    korb %k2, %k3, %k2
+; SKX-NEXT:    kshiftrb $7, %k2, %k3
+; SKX-NEXT:    kshiftlb $7, %k3, %k3
+; SKX-NEXT:    kshiftlb $2, %k2, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    korb %k3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
 ; SKX-NEXT:    kshiftrb $1, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k3, %k2
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k0, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k5
-; SKX-NEXT:    kxorb %k3, %k5, %k3
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $7, %k3, %k3
+; SKX-NEXT:    korb %k0, %k3, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftrb $3, %k3, %k5
-; SKX-NEXT:    kxorb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    kshiftrb $6, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $3, %k3, %k4
+; SKX-NEXT:    kshiftlb $3, %k4, %k4
+; SKX-NEXT:    kshiftlb $6, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftrb $6, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $7, %k5, %k4
+; SKX-NEXT:    kshiftrb $5, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
 ; SKX-NEXT:    kshiftrb $4, %k3, %k4
+; SKX-NEXT:    kshiftlb $4, %k4, %k4
+; SKX-NEXT:    kshiftlb $5, %k3, %k3
+; SKX-NEXT:    kshiftrb $5, %k3, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $7, %k5, %k4
+; SKX-NEXT:    kshiftrb $4, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $5, %k3, %k4
+; SKX-NEXT:    kshiftlb $5, %k4, %k4
+; SKX-NEXT:    kshiftlb $4, %k3, %k3
+; SKX-NEXT:    kshiftrb $4, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $6, %k3, %k4
+; SKX-NEXT:    kshiftlb $6, %k4, %k4
+; SKX-NEXT:    kshiftlb $3, %k3, %k3
+; SKX-NEXT:    kshiftrb $3, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
 ; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $7, %k3, %k4
+; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftlb $2, %k3, %k3
+; SKX-NEXT:    kshiftrb $2, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftrb $6, %k3, %k5
-; SKX-NEXT:    kxorb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftrb $1, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
+; SKX-NEXT:    korb %k3, %k4, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k0, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k6
-; SKX-NEXT:    kxorb %k5, %k6, %k5
+; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftrb $7, %k4, %k4
+; SKX-NEXT:    korb %k0, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k5, %k5
+; SKX-NEXT:    kshiftlb $6, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftrb $5, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $4, %k4, %k5
+; SKX-NEXT:    kshiftlb $4, %k5, %k5
+; SKX-NEXT:    kshiftlb $5, %k4, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $5, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $7, %k6, %k5
+; SKX-NEXT:    kshiftrb $4, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $5, %k4, %k5
+; SKX-NEXT:    kshiftlb $5, %k5, %k5
+; SKX-NEXT:    kshiftlb $4, %k4, %k4
+; SKX-NEXT:    kshiftrb $4, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $7, %k6, %k5
 ; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k5
+; SKX-NEXT:    kshiftlb $6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k4, %k4
+; SKX-NEXT:    kshiftrb $3, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $5, %k4, %k6
-; SKX-NEXT:    kxorb %k5, %k6, %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $7, %k4, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftlb $2, %k4, %k4
+; SKX-NEXT:    kshiftrb $2, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kandb %k3, %k4, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kandb %k2, %k3, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kshiftlb $7, %k4, %k3
+; SKX-NEXT:    kshiftrb $7, %k3, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k0, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k5
-; SKX-NEXT:    kxorb %k3, %k5, %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
+; SKX-NEXT:    korb %k0, %k3, %k3
+; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $3, %k3, %k4
+; SKX-NEXT:    kshiftlb $3, %k4, %k4
+; SKX-NEXT:    kshiftlb $6, %k3, %k3
+; SKX-NEXT:    kshiftrb $6, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftrb $5, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $4, %k3, %k4
+; SKX-NEXT:    kshiftlb $4, %k4, %k4
+; SKX-NEXT:    kshiftlb $5, %k3, %k3
 ; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    kxorb %k3, %k4, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftrb $3, %k3, %k5
-; SKX-NEXT:    kxorb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
 ; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $5, %k3, %k4
+; SKX-NEXT:    kshiftlb $5, %k4, %k4
+; SKX-NEXT:    kshiftlb $4, %k3, %k3
+; SKX-NEXT:    kshiftrb $4, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $6, %k3, %k4
+; SKX-NEXT:    kshiftlb $6, %k4, %k4
+; SKX-NEXT:    kshiftlb $3, %k3, %k3
+; SKX-NEXT:    kshiftrb $3, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
 ; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftrb $6, %k3, %k5
-; SKX-NEXT:    kxorb %k4, %k5, %k4
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftrb $7, %k3, %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftlb $2, %k3, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftrb $2, %k3, %k3
+; SKX-NEXT:    korb %k4, %k3, %k3
+; SKX-NEXT:    kshiftlb $7, %k5, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftrb $1, %k4, %k4
-; SKX-NEXT:    kxorb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kxorb %k0, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k6
-; SKX-NEXT:    kxorb %k5, %k6, %k5
+; SKX-NEXT:    korb %k3, %k4, %k3
+; SKX-NEXT:    kshiftlb $7, %k5, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftrb $7, %k4, %k4
+; SKX-NEXT:    korb %k0, %k4, %k4
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k5
+; SKX-NEXT:    kshiftlb $3, %k5, %k5
+; SKX-NEXT:    kshiftlb $6, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $7, %k6, %k5
+; SKX-NEXT:    kshiftrb $5, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $4, %k4, %k5
+; SKX-NEXT:    kshiftlb $4, %k5, %k5
+; SKX-NEXT:    kshiftlb $5, %k4, %k4
+; SKX-NEXT:    kshiftrb $5, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $5, %k4, %k5
+; SKX-NEXT:    kshiftlb $5, %k5, %k5
+; SKX-NEXT:    kshiftlb $4, %k4, %k4
+; SKX-NEXT:    kshiftrb $4, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k5
+; SKX-NEXT:    kshiftlb $6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k4, %k4
+; SKX-NEXT:    kshiftrb $3, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $5, %k4, %k6
-; SKX-NEXT:    kxorb %k5, %k6, %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $7, %k4, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftlb $2, %k4, %k4
+; SKX-NEXT:    kshiftrb $2, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kandb %k3, %k4, %k3
-; SKX-NEXT:    kxorb %k0, %k7, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k5
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
+; SKX-NEXT:    kshiftlb $7, %k4, %k4
+; SKX-NEXT:    kshiftrb $7, %k4, %k4
+; SKX-NEXT:    korb %k0, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k5, %k5
+; SKX-NEXT:    kshiftlb $6, %k4, %k4
+; SKX-NEXT:    kshiftrb $6, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftrb $5, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $4, %k4, %k5
-; SKX-NEXT:    kxorb %k6, %k5, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $4, %k5, %k5
+; SKX-NEXT:    kshiftlb $5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    kshiftrb $5, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $7, %k6, %k5
+; SKX-NEXT:    kshiftrb $4, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $5, %k4, %k5
+; SKX-NEXT:    kshiftlb $5, %k5, %k5
+; SKX-NEXT:    kshiftlb $4, %k4, %k4
+; SKX-NEXT:    kshiftrb $4, %k4, %k4
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kshiftlb $7, %k6, %k5
+; SKX-NEXT:    kshiftrb $3, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
 ; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    kshiftlb $6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k4, %k4
+; SKX-NEXT:    kshiftrb $3, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $2, %k5, %k5
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kshiftrb $7, %k4, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftlb $2, %k4, %k4
+; SKX-NEXT:    kshiftrb $2, %k4, %k4
+; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k5, %k4
+; SKX-NEXT:    kmovd %esi, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $7, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kmovd %edx, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $6, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kshiftrb $3, %k0, %k5
+; SKX-NEXT:    kshiftlb $3, %k5, %k5
+; SKX-NEXT:    kshiftlb $6, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    korb %k5, %k0, %k0
 ; SKX-NEXT:    kmovd %ecx, %k5
-; SKX-NEXT:    kmovd %esi, %k6
-; SKX-NEXT:    kxorb %k0, %k6, %k0
-; SKX-NEXT:    kshiftrb $2, %k0, %k6
-; SKX-NEXT:    kxorb %k5, %k6, %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k0, %k0
-; SKX-NEXT:    kshiftrb $3, %k0, %k5
-; SKX-NEXT:    kmovd %r8d, %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kshiftrb $4, %k0, %k5
+; SKX-NEXT:    kshiftlb $4, %k5, %k5
+; SKX-NEXT:    kshiftlb $5, %k0, %k0
+; SKX-NEXT:    kshiftrb $5, %k0, %k0
+; SKX-NEXT:    korb %k5, %k0, %k0
+; SKX-NEXT:    kmovd %r8d, %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k0, %k0
-; SKX-NEXT:    kshiftrb $4, %k0, %k5
-; SKX-NEXT:    kmovd %r9d, %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kshiftrb $5, %k0, %k5
+; SKX-NEXT:    kshiftlb $5, %k5, %k5
+; SKX-NEXT:    kshiftlb $4, %k0, %k0
+; SKX-NEXT:    kshiftrb $4, %k0, %k0
+; SKX-NEXT:    korb %k5, %k0, %k0
+; SKX-NEXT:    kmovd %r9d, %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k0, %k0
-; SKX-NEXT:    kshiftrb $5, %k0, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k5
+; SKX-NEXT:    kshiftlb $6, %k5, %k5
+; SKX-NEXT:    kshiftlb $3, %k0, %k0
+; SKX-NEXT:    kshiftrb $3, %k0, %k0
+; SKX-NEXT:    korb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kxorb %k5, %k0, %k0
-; SKX-NEXT:    kshiftrb $6, %k0, %k5
-; SKX-NEXT:    kxorb %k6, %k5, %k5
+; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    kshiftrb $7, %k0, %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftlb $2, %k0, %k0
+; SKX-NEXT:    kshiftrb $2, %k0, %k0
+; SKX-NEXT:    korb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    kxorb %k5, %k0, %k0
+; SKX-NEXT:    korb %k0, %k5, %k0
 ; SKX-NEXT:    kandb %k4, %k0, %k0
 ; SKX-NEXT:    kandb %k3, %k0, %k0
 ; SKX-NEXT:    kandb %k2, %k0, %k0
@@ -2144,362 +2720,557 @@ define <7 x i1> @test17(<7 x i1> %a, <7
 ; KNL_X32-LABEL: test17:
 ; KNL_X32:       ## %bb.0:
 ; KNL_X32-NEXT:    pushl %ebx
-; KNL_X32-NEXT:    subl $8, %esp
+; KNL_X32-NEXT:    pushl %eax
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $2, %k0, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k1
-; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
-; KNL_X32-NEXT:    kxorw %k1, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k2, %k3
-; KNL_X32-NEXT:    kxorw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kxorw %k1, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k2, %k3
-; KNL_X32-NEXT:    kxorw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kmovw %k0, (%esp) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k1, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $2, %k3, %k4
-; KNL_X32-NEXT:    kxorw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k3, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
+; KNL_X32-NEXT:    kshiftrw $14, %k3, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k3
+; KNL_X32-NEXT:    kshiftlw $3, %k3, %k3
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k3, %k3
+; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $12, %k3, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k3, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k3
+; KNL_X32-NEXT:    kshiftlw $4, %k3, %k3
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k3, %k3
+; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $11, %k3, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k3, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k3
+; KNL_X32-NEXT:    kshiftlw $5, %k3, %k3
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k3, %k3
+; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $10, %k3, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k3, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k3
+; KNL_X32-NEXT:    kshiftlw $6, %k3, %k3
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k3, %k3
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
+; KNL_X32-NEXT:    kshiftrw $10, %k3, %k3
+; KNL_X32-NEXT:    korw %k0, %k3, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k3
+; KNL_X32-NEXT:    kshiftlw $7, %k3, %k3
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL_X32-NEXT:    kshiftrw $9, %k3, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k0, %k3, %k3
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kxorw %k1, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $2, %k4, %k5
-; KNL_X32-NEXT:    kxorw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k4, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
+; KNL_X32-NEXT:    kshiftrw $14, %k4, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k4
+; KNL_X32-NEXT:    kshiftlw $3, %k4, %k4
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k4, %k4
+; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $12, %k4, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k4, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k4
+; KNL_X32-NEXT:    kshiftlw $4, %k4, %k4
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k4, %k4
+; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $11, %k4, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k4, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k4
+; KNL_X32-NEXT:    kshiftlw $5, %k4, %k4
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k4, %k4
+; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $10, %k4, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k4, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k4
+; KNL_X32-NEXT:    kshiftlw $6, %k4, %k4
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k4, %k4
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
+; KNL_X32-NEXT:    kshiftrw $10, %k4, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k4
+; KNL_X32-NEXT:    kshiftlw $7, %k4, %k4
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL_X32-NEXT:    kshiftrw $9, %k4, %k4
-; KNL_X32-NEXT:    kxorw %k4, %k0, %k4
+; KNL_X32-NEXT:    korw %k0, %k4, %k4
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kxorw %k1, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $2, %k5, %k6
-; KNL_X32-NEXT:    kxorw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k5, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $14, %k5, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k5
+; KNL_X32-NEXT:    kshiftlw $3, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $12, %k5, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k5, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k5
+; KNL_X32-NEXT:    kshiftlw $4, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $11, %k5, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k5, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k5
+; KNL_X32-NEXT:    kshiftlw $5, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $10, %k5, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k5, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k5
+; KNL_X32-NEXT:    kshiftlw $6, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $10, %k5, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k5
+; KNL_X32-NEXT:    kshiftlw $7, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL_X32-NEXT:    kshiftrw $9, %k5, %k5
-; KNL_X32-NEXT:    kxorw %k5, %k0, %k5
+; KNL_X32-NEXT:    korw %k0, %k5, %k5
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kxorw %k1, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $2, %k6, %k7
-; KNL_X32-NEXT:    kxorw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kxorw %k0, %k6, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $14, %k6, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k6
+; KNL_X32-NEXT:    kshiftlw $3, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k6, %k6
+; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k6
+; KNL_X32-NEXT:    kshiftlw $4, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k6, %k6
+; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k6
+; KNL_X32-NEXT:    kshiftlw $5, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k6, %k6
+; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k6
+; KNL_X32-NEXT:    kshiftlw $6, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k6, %k6
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k6
+; KNL_X32-NEXT:    kshiftlw $7, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    kxorw %k6, %k0, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k6
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kxorw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k2
-; KNL_X32-NEXT:    kxorw %k7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k7
+; KNL_X32-NEXT:    kshiftlw $3, %k7, %k7
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k7
+; KNL_X32-NEXT:    kshiftlw $4, %k7, %k7
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
+; KNL_X32-NEXT:    kshiftrw $5, %k0, %k7
+; KNL_X32-NEXT:    kshiftlw $5, %k7, %k7
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k7
+; KNL_X32-NEXT:    kshiftlw $6, %k7, %k7
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k7
+; KNL_X32-NEXT:    kshiftlw $7, %k7, %k7
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kxorw %k7, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k0, %k7, %k7
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kxorw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k3
-; KNL_X32-NEXT:    kxorw %k2, %k3, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $15, %k2, %k2
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kxorw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftrw $3, %k1, %k2
+; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftrw $2, %k1, %k3
-; KNL_X32-NEXT:    kxorw %k2, %k3, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftrw $4, %k1, %k2
+; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $13, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $13, %k1, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $3, %k1, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftrw $5, %k1, %k2
+; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $12, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $12, %k1, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $4, %k1, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftrw $6, %k1, %k2
+; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $11, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $11, %k1, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $5, %k1, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftrw $7, %k1, %k2
+; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $10, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $10, %k1, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kshiftrw $6, %k1, %k2
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kxorw %k3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    kxorw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k2, %k1
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k7, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw (%esp), %k1 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
@@ -2537,7 +3308,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andb $127, %cl
 ; KNL_X32-NEXT:    movb %cl, (%eax)
-; KNL_X32-NEXT:    addl $8, %esp
+; KNL_X32-NEXT:    addl $4, %esp
 ; KNL_X32-NEXT:    popl %ebx
 ; KNL_X32-NEXT:    retl $4
   %j = and <7 x i1> %a, %b

Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Wed Oct  2 10:47:09 2019
@@ -1886,410 +1886,495 @@ define void @extload_v8i64(<8 x i8>* %a,
 define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: test21:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %edx, %k1
-; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k2
-; KNL-NEXT:    kmovw %ecx, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k2
-; KNL-NEXT:    kmovw %r8d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k2
-; KNL-NEXT:    kmovw %r9d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k2
+; KNL-NEXT:    kshiftlw $3, %k0, %k3
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    kshiftlw $2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k4
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k5
+; KNL-NEXT:    kmovw %r8d, %k1
+; KNL-NEXT:    kshiftlw $4, %k1, %k1
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k6
+; KNL-NEXT:    kmovw %r9d, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k7
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kshiftlw $8, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $8, %k1, %k1
+; KNL-NEXT:    kshiftlw $9, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $9, %k1, %k1
+; KNL-NEXT:    kshiftlw $10, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kshiftlw $11, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kshiftlw $12, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kshiftlw $13, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $13, %k1, %k1
+; KNL-NEXT:    kshiftlw $14, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k2
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k2
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    kshiftrw $1, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k4
-; KNL-NEXT:    kxorw %k2, %k4, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k3, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k3
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $9, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $2, %k1, %k1
+; KNL-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $4, %k1, %k1
+; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
+; KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $8, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $9, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $13, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $8, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $7, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $6, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $5, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $4, %k1, %k1
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $8, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $9, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $13, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $4, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $3, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    korw %k0, %k1, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $15, %k7, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $2, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    korw %k7, %k1, %k7
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $14, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $1, %k2, %k2
-; KNL-NEXT:    kshiftrw $1, %k2, %k2
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $3, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    korw %k7, %k1, %k7
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $4, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    korw %k7, %k1, %k7
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k7
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $6, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    korw %k7, %k1, %k7
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k2, %k7
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k3, %k7
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k4, %k7
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k5, %k6
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $11, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k6, %k2, %k5
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kshiftlw $12, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k5, %k2, %k4
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $13, %k4, %k4
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    korw %k4, %k2, %k3
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    korw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftlw $14, %k3, %k3
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; KNL-NEXT:    korw %k3, %k2, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k0, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k5
-; KNL-NEXT:    kxorw %k3, %k5, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k4, %k3
-; KNL-NEXT:    kshiftrw $3, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $4, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $5, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $6, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $7, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $8, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $8, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $7, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $9, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $6, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $5, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $4, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $3, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $14, %k3, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $14, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $1, %k3, %k3
-; KNL-NEXT:    kshiftrw $1, %k3, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    korw %k4, %k3, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k5
-; KNL-NEXT:    kxorw %k4, %k5, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $13, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $8, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $7, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $6, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $5, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $4, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $3, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $14, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    korw %k4, %k0, %k4
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z}
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k2
+; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; KNL-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
 ; KNL-NEXT:    vpmovdw %zmm4, %ymm4
 ; KNL-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z}
-; KNL-NEXT:    vpmovdw %zmm4, %ymm4
+; KNL-NEXT:    vpmovdw %zmm5, %ymm4
 ; KNL-NEXT:    vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; KNL-NEXT:    vpmovdw %zmm4, %ymm4
+; KNL-NEXT:    vpmovdw %zmm6, %ymm4
 ; KNL-NEXT:    vpand %ymm3, %ymm4, %ymm3
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; KNL-NEXT:    vpmovdw %zmm4, %ymm4
+; KNL-NEXT:    vpmovdw %zmm7, %ymm4
 ; KNL-NEXT:    vpand %ymm0, %ymm4, %ymm0
 ; KNL-NEXT:    retq
 ;
@@ -2304,410 +2389,495 @@ define <64 x i16> @test21(<64 x i16> %x
 ;
 ; AVX512DQNOBW-LABEL: test21:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    kmovw %edx, %k0
-; AVX512DQNOBW-NEXT:    kmovw %edi, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k0, %k3, %k0
+; AVX512DQNOBW-NEXT:    kmovw %edi, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kmovw %esi, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k3
+; AVX512DQNOBW-NEXT:    kmovw %edx, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k3, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512DQNOBW-NEXT:    kxorw %k0, %k2, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %ecx, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %r8d, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %r9d, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k4
+; AVX512DQNOBW-NEXT:    kmovw %ecx, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k4, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k5
+; AVX512DQNOBW-NEXT:    kmovw %r8d, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k5, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k6
+; AVX512DQNOBW-NEXT:    kmovw %r9d, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k6, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k7
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k1, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k7, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k1, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k0, %k1, %k0
+; AVX512DQNOBW-NEXT:    korw %k0, %k2, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
 ; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k4, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    kxorw %k2, %k3, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k6, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k2, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    korw %k3, %k2, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k5, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k3, %k3
-; AVX512DQNOBW-NEXT:    kxorw %k3, %k4, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k6, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k3, %k3
-; AVX512DQNOBW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    korw %k4, %k3, %k3
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k1, %k5, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k1, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k5, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k7, %k7
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k7, %k7
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k7, %k7
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k7, %k7
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k1, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k3, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $9, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k4, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $10, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k5, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k6, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $11, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k6, %k1, %k5
+; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k5, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $12, %k5, %k5
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k5, %k1, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k4, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $13, %k4, %k4
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k4, %k1, %k3
+; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k4, %k4
-; AVX512DQNOBW-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    korw %k3, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k1
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQNOBW-NEXT:    korw %k4, %k1, %k1
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovm2d %k1, %zmm4
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm5
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm7
 ; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX512DQNOBW-NEXT:    vpmovm2d %k3, %zmm4
-; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512DQNOBW-NEXT:    vpmovdw %zmm5, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm2, %ymm4, %ymm2
-; AVX512DQNOBW-NEXT:    vpmovm2d %k2, %zmm4
-; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512DQNOBW-NEXT:    vpmovdw %zmm6, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm3, %ymm4, %ymm3
-; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm4
-; AVX512DQNOBW-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512DQNOBW-NEXT:    vpmovdw %zmm7, %ymm4
 ; AVX512DQNOBW-NEXT:    vpand %ymm0, %ymm4, %ymm0
 ; AVX512DQNOBW-NEXT:    retq
   %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Wed Oct  2 10:47:09 2019
@@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
 ; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftrw $10, %k0, %k2
-; KNL-NEXT:    kxorw %k1, %k2, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $5, %k2, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; KNL-NEXT:    retq
@@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    kshiftrw $10, %k1, %k2
-; SKX-NEXT:    kxorw %k0, %k2, %k0
+; SKX-NEXT:    kshiftrw $11, %k1, %k2
+; SKX-NEXT:    kshiftlw $11, %k2, %k2
+; SKX-NEXT:    kshiftlw $6, %k1, %k1
+; SKX-NEXT:    kshiftrw $6, %k1, %k1
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $5, %k0, %k0
-; SKX-NEXT:    kxorw %k0, %k1, %k0
+; SKX-NEXT:    korw %k0, %k2, %k0
+; SKX-NEXT:    korw %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq
@@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
 ; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kxorw %k1, %k2, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $11, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    korw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    retq
@@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    kshiftrb $4, %k1, %k2
-; SKX-NEXT:    kxorb %k0, %k2, %k0
+; SKX-NEXT:    kshiftrb $5, %k1, %k2
+; SKX-NEXT:    kshiftlb $5, %k2, %k2
+; SKX-NEXT:    kshiftlb $4, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k0, %k0
 ; SKX-NEXT:    kshiftrb $3, %k0, %k0
-; SKX-NEXT:    kxorb %k0, %k1, %k0
+; SKX-NEXT:    korb %k0, %k2, %k0
+; SKX-NEXT:    korb %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
@@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k1
+; KNL-NEXT:    kshiftrw $5, %k0, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $11, %k2, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    orl %ecx, %eax
 ; KNL-NEXT:    vzeroupper
@@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
-; SKX-NEXT:    kshiftrd $4, %k0, %k1
+; SKX-NEXT:    kshiftrd $5, %k0, %k1
+; SKX-NEXT:    kshiftld $5, %k1, %k1
+; SKX-NEXT:    kshiftld $28, %k0, %k0
+; SKX-NEXT:    kshiftrd $28, %k0, %k0
 ; SKX-NEXT:    kmovd %eax, %k2
-; SKX-NEXT:    kxord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k1, %k1
-; SKX-NEXT:    kshiftrd $27, %k1, %k1
-; SKX-NEXT:    kxord %k1, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k2, %k2
+; SKX-NEXT:    kshiftrd $27, %k2, %k2
+; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32
 ; KNL-NEXT:    cmpl %esi, %edi
 ; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k1
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftrw $3, %k0, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    korw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0
-; SKX-NEXT:    kshiftrb $2, %k0, %k1
-; SKX-NEXT:    kmovd %eax, %k2
-; SKX-NEXT:    kxorb %k2, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k0, %k1
+; SKX-NEXT:    kshiftlb $3, %k1, %k1
+; SKX-NEXT:    kshiftlb $6, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    korw %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Wed Oct  2 10:47:09 2019
@@ -1069,12 +1069,16 @@ define <64 x i8> @test16(i64 %x) {
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftrw $5, %k0, %k4
-; KNL-NEXT:    kxnorw %k0, %k0, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k4
+; KNL-NEXT:    kshiftrw $6, %k0, %k4
+; KNL-NEXT:    kshiftlw $6, %k4, %k4
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $10, %k5, %k5
+; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    korw %k4, %k0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
@@ -1091,24 +1095,32 @@ define <64 x i8> @test16(i64 %x) {
 ; SKX-LABEL: test16:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovq %rdi, %k0
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kshiftrq $5, %k0, %k2
-; SKX-NEXT:    kxorq %k1, %k2, %k1
-; SKX-NEXT:    kshiftlq $63, %k1, %k1
-; SKX-NEXT:    kshiftrq $58, %k1, %k1
-; SKX-NEXT:    kxorq %k1, %k0, %k0
+; SKX-NEXT:    kshiftrq $6, %k0, %k1
+; SKX-NEXT:    kshiftlq $6, %k1, %k1
+; SKX-NEXT:    kshiftlq $59, %k0, %k0
+; SKX-NEXT:    kshiftrq $59, %k0, %k0
+; SKX-NEXT:    movb $1, %al
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kshiftlq $63, %k2, %k2
+; SKX-NEXT:    kshiftrq $58, %k2, %k2
+; SKX-NEXT:    korq %k2, %k1, %k1
+; SKX-NEXT:    korq %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test16:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
-; AVX512BW-NEXT:    kxnorw %k0, %k0, %k1
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
-; AVX512BW-NEXT:    kxorq %k1, %k2, %k1
-; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
-; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
-; AVX512BW-NEXT:    kxorq %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    kshiftlq $6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlq $59, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k0
+; AVX512BW-NEXT:    movb $1, %al
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $58, %k2, %k2
+; AVX512BW-NEXT:    korq %k2, %k1, %k1
+; AVX512BW-NEXT:    korq %k1, %k0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1116,27 +1128,31 @@ define <64 x i8> @test16(i64 %x) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    movq %rdi, %rax
 ; AVX512DQ-NEXT:    movl %edi, %ecx
-; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %edi, %k1
 ; AVX512DQ-NEXT:    shrq $32, %rdi
 ; AVX512DQ-NEXT:    shrq $48, %rax
 ; AVX512DQ-NEXT:    shrl $16, %ecx
-; AVX512DQ-NEXT:    kmovw %ecx, %k1
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k4
-; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    kshiftlw $6, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512DQ-NEXT:    movb $1, %al
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512DQ-NEXT:    kshiftrw $10, %k5, %k5
+; AVX512DQ-NEXT:    korw %k5, %k4, %k4
+; AVX512DQ-NEXT:    korw %k4, %k1, %k1
 ; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
 ; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1145,12 +1161,16 @@ define <64 x i8> @test16(i64 %x) {
 ; X86-LABEL: test16:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
-; X86-NEXT:    kshiftrq $5, %k0, %k1
-; X86-NEXT:    kxnorw %k0, %k0, %k2
-; X86-NEXT:    kxorq %k2, %k1, %k1
-; X86-NEXT:    kshiftlq $63, %k1, %k1
-; X86-NEXT:    kshiftrq $58, %k1, %k1
-; X86-NEXT:    kxorq %k1, %k0, %k0
+; X86-NEXT:    kshiftrq $6, %k0, %k1
+; X86-NEXT:    kshiftlq $6, %k1, %k1
+; X86-NEXT:    kshiftlq $59, %k0, %k0
+; X86-NEXT:    kshiftrq $59, %k0, %k0
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    kmovd %eax, %k2
+; X86-NEXT:    kshiftlq $63, %k2, %k2
+; X86-NEXT:    kshiftrq $58, %k2, %k2
+; X86-NEXT:    korq %k2, %k1, %k1
+; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    vpmovm2b %k0, %zmm0
 ; X86-NEXT:    retl
   %a = bitcast i64 %x to <64 x i1>
@@ -1174,12 +1194,15 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    cmpl %edx, %esi
 ; KNL-NEXT:    setg %al
-; KNL-NEXT:    kshiftrw $5, %k0, %k4
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k0, %k4
+; KNL-NEXT:    kshiftrw $6, %k0, %k4
+; KNL-NEXT:    kshiftlw $6, %k4, %k4
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $10, %k5, %k5
+; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    korw %k4, %k0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
@@ -1198,12 +1221,15 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    kshiftrq $5, %k0, %k2
-; SKX-NEXT:    kxorq %k1, %k2, %k1
-; SKX-NEXT:    kshiftlq $63, %k1, %k1
-; SKX-NEXT:    kshiftrq $58, %k1, %k1
-; SKX-NEXT:    kxorq %k1, %k0, %k0
+; SKX-NEXT:    kshiftrq $6, %k0, %k1
+; SKX-NEXT:    kshiftlq $6, %k1, %k1
+; SKX-NEXT:    kshiftlq $59, %k0, %k0
+; SKX-NEXT:    kshiftrq $59, %k0, %k0
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kshiftlq $63, %k2, %k2
+; SKX-NEXT:    kshiftrq $58, %k2, %k2
+; SKX-NEXT:    korq %k2, %k1, %k1
+; SKX-NEXT:    korq %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
 ;
@@ -1212,12 +1238,15 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
 ; AVX512BW-NEXT:    cmpl %edx, %esi
 ; AVX512BW-NEXT:    setg %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
-; AVX512BW-NEXT:    kxorq %k1, %k2, %k1
-; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
-; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
-; AVX512BW-NEXT:    kxorq %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    kshiftlq $6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlq $59, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k0
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $58, %k2, %k2
+; AVX512BW-NEXT:    korq %k2, %k1, %k1
+; AVX512BW-NEXT:    korq %k1, %k0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1225,29 +1254,32 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    movq %rdi, %rax
 ; AVX512DQ-NEXT:    movl %edi, %ecx
-; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %edi, %k1
 ; AVX512DQ-NEXT:    shrq $32, %rdi
 ; AVX512DQ-NEXT:    shrq $48, %rax
 ; AVX512DQ-NEXT:    shrl $16, %ecx
-; AVX512DQ-NEXT:    kmovw %ecx, %k1
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
 ; AVX512DQ-NEXT:    cmpl %edx, %esi
 ; AVX512DQ-NEXT:    setg %al
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    kshiftlw $6, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512DQ-NEXT:    kshiftrw $10, %k5, %k5
+; AVX512DQ-NEXT:    korw %k5, %k4, %k4
+; AVX512DQ-NEXT:    korw %k4, %k1, %k1
 ; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
 ; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1259,12 +1291,15 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    setg %al
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    kshiftrq $5, %k0, %k2
-; X86-NEXT:    kxorq %k1, %k2, %k1
-; X86-NEXT:    kshiftlq $63, %k1, %k1
-; X86-NEXT:    kshiftrq $58, %k1, %k1
-; X86-NEXT:    kxorq %k1, %k0, %k0
+; X86-NEXT:    kshiftrq $6, %k0, %k1
+; X86-NEXT:    kshiftlq $6, %k1, %k1
+; X86-NEXT:    kshiftlq $59, %k0, %k0
+; X86-NEXT:    kshiftrq $59, %k0, %k0
+; X86-NEXT:    kmovd %eax, %k2
+; X86-NEXT:    kshiftlq $63, %k2, %k2
+; X86-NEXT:    kshiftrq $58, %k2, %k2
+; X86-NEXT:    korq %k2, %k1, %k1
+; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    vpmovm2b %k0, %zmm0
 ; X86-NEXT:    retl
   %a = bitcast i64 %x to <64 x i1>
@@ -1281,10 +1316,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; KNL-NEXT:    kshiftrw $9, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k0, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k1
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k3
 ; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftlw $9, %k0, %k0
 ; KNL-NEXT:    kshiftrw $9, %k0, %k0
 ; KNL-NEXT:    kshiftlw $7, %k2, %k1
@@ -1301,10 +1338,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    kshiftrw $9, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k0, %k3
-; SKX-NEXT:    kxorb %k1, %k3, %k1
+; SKX-NEXT:    kshiftlb $2, %k0, %k0
+; SKX-NEXT:    kshiftrb $2, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k3
 ; SKX-NEXT:    kshiftlb $6, %k1, %k1
-; SKX-NEXT:    kxorb %k1, %k0, %k0
+; SKX-NEXT:    korb %k1, %k3, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kshiftlb $1, %k0, %k0
 ; SKX-NEXT:    kshiftrb $1, %k0, %k0
 ; SKX-NEXT:    kshiftlb $7, %k2, %k1
@@ -1318,10 +1357,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512BW-NEXT:    kshiftrw $6, %k0, %k3
-; AVX512BW-NEXT:    kxorw %k1, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $7, %k0, %k3
 ; AVX512BW-NEXT:    kshiftlw $6, %k1, %k1
-; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kshiftlw $9, %k0, %k0
 ; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
 ; AVX512BW-NEXT:    kshiftlw $7, %k2, %k1
@@ -1337,10 +1378,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; AVX512DQ-NEXT:    kmovw %esi, %k1
 ; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k3
-; AVX512DQ-NEXT:    kxorb %k1, %k3, %k1
+; AVX512DQ-NEXT:    kshiftlb $2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k3
 ; AVX512DQ-NEXT:    kshiftlb $6, %k1, %k1
-; AVX512DQ-NEXT:    kxorb %k1, %k0, %k0
+; AVX512DQ-NEXT:    korb %k1, %k3, %k1
+; AVX512DQ-NEXT:    korb %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlb $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlb $7, %k2, %k1
@@ -1357,10 +1400,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-NEXT:    kshiftrw $9, %k1, %k1
-; X86-NEXT:    kshiftrb $6, %k0, %k3
-; X86-NEXT:    kxorb %k1, %k3, %k1
+; X86-NEXT:    kshiftlb $7, %k0, %k3
+; X86-NEXT:    kshiftlb $2, %k0, %k0
+; X86-NEXT:    kshiftrb $2, %k0, %k0
 ; X86-NEXT:    kshiftlb $6, %k1, %k1
-; X86-NEXT:    kxorb %k1, %k0, %k0
+; X86-NEXT:    korb %k1, %k3, %k1
+; X86-NEXT:    korb %k1, %k0, %k0
 ; X86-NEXT:    kshiftlb $1, %k0, %k0
 ; X86-NEXT:    kshiftrb $1, %k0, %k0
 ; X86-NEXT:    kshiftlb $7, %k2, %k1
@@ -2748,403 +2793,488 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %ecx, %k0
-; KNL-NEXT:    kmovw %esi, %k2
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k1
-; KNL-NEXT:    kxorw %k1, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k3
-; KNL-NEXT:    kxorw %k0, %k3, %k0
+; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k3
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kshiftlw $2, %k1, %k1
+; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
 ; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kxorw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kmovw %r8d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kmovw %r9d, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k2
+; KNL-NEXT:    kshiftlw $4, %k0, %k4
+; KNL-NEXT:    kmovw %r8d, %k1
+; KNL-NEXT:    kshiftlw $3, %k1, %k1
+; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k5
+; KNL-NEXT:    kmovw %r9d, %k1
+; KNL-NEXT:    kshiftlw $4, %k1, %k1
+; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k6
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $5, %k1, %k1
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k7
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k2
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
+; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k2
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kshiftlw $8, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $8, %k1, %k1
+; KNL-NEXT:    kshiftlw $9, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $9, %k1, %k1
+; KNL-NEXT:    kshiftlw $10, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $10, %k1, %k1
+; KNL-NEXT:    kshiftlw $11, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $11, %k1, %k1
+; KNL-NEXT:    kshiftlw $12, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $12, %k1, %k1
+; KNL-NEXT:    kshiftlw $13, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $13, %k1, %k1
+; KNL-NEXT:    kshiftlw $14, %k0, %k2
+; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k2
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k2
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k2
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $1, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $2, %k2, %k2
+; KNL-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k2, %k3, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k2, %k4, %k2
 ; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k2, %k5, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kxorw %k1, %k3, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k4
-; KNL-NEXT:    kxorw %k2, %k4, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    kxorw %k2, %k3, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    korw %k2, %k6, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $9, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $9, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $8, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $10, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $7, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $11, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $6, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $12, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $5, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $13, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $4, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $3, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $2, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k3
+; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $14, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k2, %k2
+; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $1, %k2, %k2
-; KNL-NEXT:    kshiftrw $1, %k2, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    korw %k3, %k2, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kxorw %k1, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k5
-; KNL-NEXT:    kxorw %k3, %k5, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k3
-; KNL-NEXT:    kxorw %k3, %k4, %k3
-; KNL-NEXT:    kshiftrw $3, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $2, %k2, %k2
+; KNL-NEXT:    korw %k2, %k3, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $13, %k0, %k0
+; KNL-NEXT:    kshiftrw $13, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $4, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    korw %k2, %k4, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $5, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    korw %k2, %k5, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $11, %k0, %k0
+; KNL-NEXT:    kshiftrw $11, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $6, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    korw %k2, %k6, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $7, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $9, %k0, %k0
+; KNL-NEXT:    kshiftrw $9, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $8, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $8, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k1, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $8, %k0, %k0
+; KNL-NEXT:    kshiftrw $8, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $7, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $9, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k3, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $7, %k0, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $6, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $9, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k4, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $6, %k0, %k0
+; KNL-NEXT:    kshiftrw $6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $5, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $10, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k5, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $5, %k0, %k0
+; KNL-NEXT:    kshiftrw $5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $4, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $11, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $4, %k0, %k0
+; KNL-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $3, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $12, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $3, %k0, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftrw $14, %k3, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $13, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $2, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $14, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k3, %k3
-; KNL-NEXT:    kshiftlw $1, %k3, %k3
-; KNL-NEXT:    kshiftrw $1, %k3, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kshiftlw $1, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    korw %k4, %k3, %k3
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $1, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    korw %k2, %k0, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k1, %k5, %k1
-; KNL-NEXT:    kshiftrw $2, %k1, %k5
-; KNL-NEXT:    kxorw %k4, %k5, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $13, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $15, %k7, %k7
+; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $2, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    korw %k7, %k0, %k7
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
+; KNL-NEXT:    kshiftrw $14, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $3, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    korw %k7, %k0, %k7
+; KNL-NEXT:    kshiftlw $13, %k2, %k2
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $4, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    korw %k7, %k0, %k7
+; KNL-NEXT:    kshiftlw $12, %k2, %k2
+; KNL-NEXT:    kshiftrw $12, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k7
+; KNL-NEXT:    kshiftlw $11, %k2, %k2
+; KNL-NEXT:    kshiftrw $11, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $6, %k7, %k7
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    korw %k7, %k0, %k7
+; KNL-NEXT:    kshiftlw $10, %k2, %k2
+; KNL-NEXT:    kshiftrw $10, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $8, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $8, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k7
+; KNL-NEXT:    kshiftlw $9, %k2, %k2
+; KNL-NEXT:    kshiftrw $9, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $7, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $9, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k3, %k7
+; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kshiftrw $8, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $6, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k4, %k7
+; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kshiftrw $7, %k2, %k2
+; KNL-NEXT:    korw %k7, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $5, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k5, %k6
+; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kshiftrw $6, %k2, %k2
+; KNL-NEXT:    korw %k6, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $4, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $11, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k6, %k1, %k5
+; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kshiftrw $5, %k2, %k2
+; KNL-NEXT:    korw %k5, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $3, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k4
+; KNL-NEXT:    kshiftlw $12, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k5, %k1, %k4
+; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kshiftrw $4, %k2, %k2
+; KNL-NEXT:    korw %k4, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $2, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k4
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $13, %k4, %k4
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k4, %k1, %k3
+; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kshiftrw $3, %k2, %k2
+; KNL-NEXT:    korw %k3, %k2, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kxorw %k5, %k4, %k4
-; KNL-NEXT:    kshiftlw $14, %k4, %k4
-; KNL-NEXT:    kxorw %k4, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kshiftlw $14, %k3, %k3
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    korw %k3, %k1, %k1
+; KNL-NEXT:    kshiftlw $2, %k2, %k2
+; KNL-NEXT:    kshiftrw $2, %k2, %k2
+; KNL-NEXT:    korw %k1, %k2, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    korw %k4, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, 6(%rdi)
-; KNL-NEXT:    kmovw %k3, 4(%rdi)
-; KNL-NEXT:    kmovw %k2, 2(%rdi)
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    kmovw %k0, 4(%rdi)
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; KNL-NEXT:    kmovw %k0, 2(%rdi)
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; KNL-NEXT:    kmovw %k0, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -3166,403 +3296,488 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; AVX512DQ-LABEL: store_64i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %ecx, %k0
-; AVX512DQ-NEXT:    kmovw %esi, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQ-NEXT:    kxorw %k1, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQ-NEXT:    kxorw %k0, %k3, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k0
 ; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kmovw %edx, %k1
+; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k3
+; AVX512DQ-NEXT:    kmovw %ecx, %k1
+; AVX512DQ-NEXT:    kshiftlw $2, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k3, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512DQ-NEXT:    kxorw %k0, %k2, %k0
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %r8d, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %r9d, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %r8d, %k1
+; AVX512DQ-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k4, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k5
+; AVX512DQ-NEXT:    kmovw %r9d, %k1
+; AVX512DQ-NEXT:    kshiftlw $4, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k5, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k6
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $5, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k6, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k7
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k1
+; AVX512DQ-NEXT:    korw %k1, %k7, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $8, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $9, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $10, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $12, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $13, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kshiftlw $14, %k1, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k0, %k1, %k0
+; AVX512DQ-NEXT:    korw %k0, %k2, %k0
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k2, %k3, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k2, %k4, %k2
 ; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k2, %k5, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kxorw %k1, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    kxorw %k2, %k4, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQ-NEXT:    kxorw %k2, %k3, %k2
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k6, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $14, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    korw %k3, %k2, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k3
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kxorw %k1, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512DQ-NEXT:    kxorw %k3, %k5, %k3
-; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k3
-; AVX512DQ-NEXT:    kxorw %k3, %k4, %k3
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k3, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k4, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k5, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k6, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k1, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k3, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k4, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k5, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $14, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kshiftlw $1, %k3, %k3
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    korw %k4, %k3, %k3
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k2, %k0, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k1, %k5, %k1
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k5
-; AVX512DQ-NEXT:    kxorw %k4, %k5, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $15, %k7, %k7
+; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $2, %k7, %k7
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k7, %k0, %k7
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $3, %k7, %k7
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k7, %k0, %k7
+; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $4, %k7, %k7
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k7, %k0, %k7
+; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $5, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k7
+; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $6, %k7, %k7
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k7, %k0, %k7
+; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $7, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k1, %k7
+; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $8, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k3, %k7
+; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $9, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k4, %k7
+; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512DQ-NEXT:    korw %k7, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $10, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k5, %k6
+; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k2
+; AVX512DQ-NEXT:    korw %k6, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $11, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k6, %k1, %k5
+; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k2
+; AVX512DQ-NEXT:    korw %k5, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQ-NEXT:    kshiftlw $12, %k5, %k5
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k5, %k1, %k4
+; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512DQ-NEXT:    korw %k4, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kshiftlw $13, %k4, %k4
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k4, %k1, %k3
+; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512DQ-NEXT:    korw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kxorw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $14, %k4, %k4
-; AVX512DQ-NEXT:    kxorw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %eax, %k3
+; AVX512DQ-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    korw %k3, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512DQ-NEXT:    korw %k1, %k2, %k1
 ; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k1
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
-; AVX512DQ-NEXT:    korw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQ-NEXT:    kmovw %k1, 6(%rdi)
-; AVX512DQ-NEXT:    kmovw %k3, 4(%rdi)
-; AVX512DQ-NEXT:    kmovw %k2, 2(%rdi)
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    kmovw %k0, 4(%rdi)
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
 ; AVX512DQ-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/masked_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_store.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_store.ll Wed Oct  2 10:47:09 2019
@@ -4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32
 ; AVX512F-LABEL: widen_masked_store:
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    kmovw %edx, %k0
 ; AVX512F-NEXT:    andl $1, %esi
-; AVX512F-NEXT:    kmovw %esi, %k1
-; AVX512F-NEXT:    kxorw %k0, %k0, %k2
-; AVX512F-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512F-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512F-NEXT:    korw %k1, %k2, %k1
-; AVX512F-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-NEXT:    kxorw %k0, %k2, %k0
+; AVX512F-NEXT:    kmovw %esi, %k0
+; AVX512F-NEXT:    kxorw %k0, %k0, %k1
+; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512F-NEXT:    korw %k0, %k1, %k0
+; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-NEXT:    kshiftlw $2, %k1, %k1
 ; AVX512F-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512F-NEXT:    kmovw %edx, %k2
+; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512F-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512F-NEXT:    korw %k2, %k1, %k1
+; AVX512F-NEXT:    korw %k1, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512F-NEXT:    kxorw %k0, %k1, %k0
-; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-NEXT:    kmovw %ecx, %k2
-; AVX512F-NEXT:    kxorw %k2, %k1, %k1
+; AVX512F-NEXT:    korw %k1, %k0, %k0
+; AVX512F-NEXT:    kmovw %ecx, %k1
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-NEXT:    kxorw %k1, %k0, %k0
+; AVX512F-NEXT:    korw %k0, %k1, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
@@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32
 ;
 ; AVX512VLDQ-LABEL: widen_masked_store:
 ; AVX512VLDQ:       ## %bb.0:
-; AVX512VLDQ-NEXT:    kmovw %edx, %k0
-; AVX512VLDQ-NEXT:    kmovw %esi, %k1
-; AVX512VLDQ-NEXT:    kshiftlb $7, %k1, %k1
-; AVX512VLDQ-NEXT:    kshiftrb $7, %k1, %k1
-; AVX512VLDQ-NEXT:    kxorw %k0, %k0, %k2
-; AVX512VLDQ-NEXT:    kshiftrb $1, %k2, %k2
-; AVX512VLDQ-NEXT:    kshiftlb $1, %k2, %k2
-; AVX512VLDQ-NEXT:    korb %k1, %k2, %k1
-; AVX512VLDQ-NEXT:    kshiftrb $1, %k1, %k2
-; AVX512VLDQ-NEXT:    kxorb %k0, %k2, %k0
+; AVX512VLDQ-NEXT:    kmovw %esi, %k0
 ; AVX512VLDQ-NEXT:    kshiftlb $7, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k0
-; AVX512VLDQ-NEXT:    kxorb %k0, %k1, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kxorw %k0, %k0, %k1
+; AVX512VLDQ-NEXT:    kshiftrb $1, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $1, %k1, %k1
+; AVX512VLDQ-NEXT:    korb %k0, %k1, %k0
 ; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
-; AVX512VLDQ-NEXT:    kmovw %ecx, %k2
-; AVX512VLDQ-NEXT:    kxorb %k2, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $2, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %edx, %k2
+; AVX512VLDQ-NEXT:    kshiftlb $7, %k2, %k2
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k2, %k2
+; AVX512VLDQ-NEXT:    korb %k2, %k1, %k1
+; AVX512VLDQ-NEXT:    korb %k1, %k0, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $3, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512VLDQ-NEXT:    korw %k1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %ecx, %k1
 ; AVX512VLDQ-NEXT:    kshiftlb $7, %k1, %k1
 ; AVX512VLDQ-NEXT:    kshiftrb $5, %k1, %k1
-; AVX512VLDQ-NEXT:    kxorw %k1, %k0, %k1
+; AVX512VLDQ-NEXT:    korw %k0, %k1, %k1
 ; AVX512VLDQ-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: widen_masked_store:
 ; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    kmovd %edx, %k0
 ; AVX512VLBW-NEXT:    andl $1, %esi
-; AVX512VLBW-NEXT:    kmovw %esi, %k1
-; AVX512VLBW-NEXT:    kxorw %k0, %k0, %k2
-; AVX512VLBW-NEXT:    kshiftrw $1, %k2, %k2
-; AVX512VLBW-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512VLBW-NEXT:    korw %k1, %k2, %k1
-; AVX512VLBW-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512VLBW-NEXT:    kxorw %k0, %k2, %k0
+; AVX512VLBW-NEXT:    kmovw %esi, %k0
+; AVX512VLBW-NEXT:    kxorw %k0, %k0, %k1
+; AVX512VLBW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512VLBW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512VLBW-NEXT:    korw %k0, %k1, %k0
+; AVX512VLBW-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLBW-NEXT:    kshiftlw $2, %k1, %k1
 ; AVX512VLBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512VLBW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLBW-NEXT:    kmovd %edx, %k2
+; AVX512VLBW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512VLBW-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512VLBW-NEXT:    korw %k2, %k1, %k1
+; AVX512VLBW-NEXT:    korw %k1, %k0, %k0
+; AVX512VLBW-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512VLBW-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512VLBW-NEXT:    kxorw %k0, %k1, %k0
-; AVX512VLBW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512VLBW-NEXT:    kmovd %ecx, %k2
-; AVX512VLBW-NEXT:    kxorw %k2, %k1, %k1
+; AVX512VLBW-NEXT:    korw %k1, %k0, %k0
+; AVX512VLBW-NEXT:    kmovd %ecx, %k1
 ; AVX512VLBW-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512VLBW-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512VLBW-NEXT:    kxorw %k1, %k0, %k1
+; AVX512VLBW-NEXT:    korw %k0, %k1, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT:    retq
   call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)

Modified: llvm/trunk/test/CodeGen/X86/vec_smulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_smulo.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_smulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_smulo.ll Wed Oct  2 10:47:09 2019
@@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
 ;
 ; AVX512-LABEL: smulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovq %xmm1, %rax
-; AVX512-NEXT:    vmovq %xmm0, %rcx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm0, %rsi
 ; AVX512-NEXT:    imulq %rdx, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    seto %dl
 ; AVX512-NEXT:    imulq %rax, %rcx
-; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vmovq %rcx, %xmm0
+; AVX512-NEXT:    vmovq %rsi, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovd %edx, %k1
+; AVX512-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    kshiftlw $2, %k0, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    korw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ;
 ; AVX512-LABEL: smulo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
 ; AVX512-NEXT:    kmovd %k1, %r9d
 ; AVX512-NEXT:    andb $1, %r9b
 ; AVX512-NEXT:    negb %r9b
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kshiftrw $3, %k1, %k2
 ; AVX512-NEXT:    kmovd %k2, %r10d
 ; AVX512-NEXT:    andb $1, %r10b
 ; AVX512-NEXT:    negb %r10b
 ; AVX512-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512-NEXT:    kmovd %k1, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
-; AVX512-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512-NEXT:    kmovd %k0, %esi
+; AVX512-NEXT:    kmovd %k2, %r11d
+; AVX512-NEXT:    andb $1, %r11b
+; AVX512-NEXT:    negb %r11b
+; AVX512-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512-NEXT:    kmovd %k2, %ebx
+; AVX512-NEXT:    andb $1, %bl
+; AVX512-NEXT:    negb %bl
+; AVX512-NEXT:    kshiftrw $1, %k0, %k2
+; AVX512-NEXT:    kmovd %k2, %esi
 ; AVX512-NEXT:    andb $1, %sil
 ; AVX512-NEXT:    negb %sil
-; AVX512-NEXT:    kmovd %k1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    negb %al
+; AVX512-NEXT:    kshiftrw $1, %k1, %k2
 ; AVX512-NEXT:    kmovd %k2, %edx
 ; AVX512-NEXT:    andb $1, %dl
 ; AVX512-NEXT:    negb %dl
+; AVX512-NEXT:    kmovd %k1, %eax
+; AVX512-NEXT:    andb $1, %al
+; AVX512-NEXT:    negb %al
+; AVX512-NEXT:    kmovd %k0, %ecx
+; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    negb %cl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    imulb %dl
+; AVX512-NEXT:    imulb %cl
 ; AVX512-NEXT:    movl %eax, %r8d
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    movl %r8d, %edx
-; AVX512-NEXT:    andb $1, %dl
-; AVX512-NEXT:    negb %dl
-; AVX512-NEXT:    cmpb %r8b, %dl
-; AVX512-NEXT:    setne %dl
-; AVX512-NEXT:    orb %al, %dl
+; AVX512-NEXT:    movl %r8d, %ecx
+; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    negb %cl
+; AVX512-NEXT:    cmpb %r8b, %cl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    movl %esi, %eax
-; AVX512-NEXT:    imulb %cl
+; AVX512-NEXT:    kmovd %eax, %k0
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512-NEXT:    movl %edx, %eax
+; AVX512-NEXT:    imulb %sil
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    seto %al
+; AVX512-NEXT:    movl %edx, %ecx
+; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    negb %cl
+; AVX512-NEXT:    cmpb %dl, %cl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    kmovd %eax, %k2
+; AVX512-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k0, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512-NEXT:    kshiftlw $3, %k0, %k2
+; AVX512-NEXT:    movl %r11d, %eax
+; AVX512-NEXT:    imulb %bl
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    movl %esi, %ecx
@@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kxorw %k0, %k2, %k2
-; AVX512-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512-NEXT:    kxorw %k1, %k3, %k1
-; AVX512-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k2, %k1
+; AVX512-NEXT:    kmovd %eax, %k3
+; AVX512-NEXT:    kshiftlw $2, %k3, %k3
+; AVX512-NEXT:    korw %k3, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
 ; AVX512-NEXT:    kshiftlw $13, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512-NEXT:    movl %r9d, %eax
-; AVX512-NEXT:    imulb %r10b
+; AVX512-NEXT:    movl %r10d, %eax
+; AVX512-NEXT:    imulb %r9b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
 ; AVX512-NEXT:    seto %cl
-; AVX512-NEXT:    movl %eax, %edx
-; AVX512-NEXT:    andb $1, %dl
-; AVX512-NEXT:    negb %dl
-; AVX512-NEXT:    cmpb %al, %dl
-; AVX512-NEXT:    setne %dl
-; AVX512-NEXT:    orb %cl, %dl
+; AVX512-NEXT:    movl %eax, %ebx
+; AVX512-NEXT:    andb $1, %bl
+; AVX512-NEXT:    negb %bl
+; AVX512-NEXT:    cmpb %al, %bl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %cl, %bl
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    kmovd %ecx, %k2
 ; AVX512-NEXT:    kshiftlw $3, %k2, %k2
@@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    kmovd %r8d, %k1
-; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kxorw %k0, %k2, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kxorw %k1, %k2, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    kmovd %edx, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k0, %k0
+; AVX512-NEXT:    korw %k0, %k1, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovd %esi, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512-NEXT:    kshiftlw $4, %k1, %k1
+; AVX512-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    korw %k0, %k1, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0

Modified: llvm/trunk/test/CodeGen/X86/vec_umulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_umulo.ll?rev=373495&r1=373494&r2=373495&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_umulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_umulo.ll Wed Oct  2 10:47:09 2019
@@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64>
 ;
 ; AVX512-LABEL: umulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovq %xmm0, %rcx
-; AVX512-NEXT:    vmovq %xmm1, %rsi
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm1, %rdx
 ; AVX512-NEXT:    mulq %rdx
-; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    movq %rax, %rsi
+; AVX512-NEXT:    seto %r9b
 ; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    mulq %rsi
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    mulq %r8
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vmovq %rsi, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovd %r9d, %k1
+; AVX512-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    kshiftlw $2, %k0, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    korw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ;
 ; AVX512-LABEL: umulo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbx
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
@@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    kmovd %k2, %r10d
 ; AVX512-NEXT:    andb $1, %r10b
 ; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kmovd %k0, %esi
+; AVX512-NEXT:    kmovd %k2, %r11d
+; AVX512-NEXT:    andb $1, %r11b
+; AVX512-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512-NEXT:    kmovd %k2, %ebx
+; AVX512-NEXT:    andb $1, %bl
+; AVX512-NEXT:    kshiftrw $1, %k0, %k2
+; AVX512-NEXT:    kmovd %k2, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512-NEXT:    kmovd %k2, %esi
 ; AVX512-NEXT:    andb $1, %sil
-; AVX512-NEXT:    kshiftrw $2, %k1, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    andb $1, %al
 ; AVX512-NEXT:    kmovd %k1, %ecx
 ; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    kmovd %k2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    kmovd %k0, %edx
-; AVX512-NEXT:    andb $1, %dl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    mulb %dl
+; AVX512-NEXT:    mulb %cl
 ; AVX512-NEXT:    movl %eax, %r8d
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    testb $-2, %r8b
-; AVX512-NEXT:    setne %dl
-; AVX512-NEXT:    orb %al, %dl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    movl %esi, %eax
-; AVX512-NEXT:    mulb %cl
+; AVX512-NEXT:    kmovd %eax, %k0
+; AVX512-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512-NEXT:    movl %edx, %eax
+; AVX512-NEXT:    mulb %sil
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    seto %al
+; AVX512-NEXT:    testb $-2, %dl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    kmovd %eax, %k2
+; AVX512-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k0, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512-NEXT:    kshiftlw $3, %k0, %k2
+; AVX512-NEXT:    movl %r11d, %eax
+; AVX512-NEXT:    mulb %bl
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    testb $-2, %sil
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kxorw %k0, %k2, %k2
-; AVX512-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512-NEXT:    kxorw %k1, %k3, %k1
-; AVX512-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k2, %k1
+; AVX512-NEXT:    kmovd %eax, %k3
+; AVX512-NEXT:    kshiftlw $2, %k3, %k3
+; AVX512-NEXT:    korw %k3, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
 ; AVX512-NEXT:    kshiftlw $13, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    movl %r9d, %eax
@@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
 ; AVX512-NEXT:    seto %cl
 ; AVX512-NEXT:    testb $-2, %al
-; AVX512-NEXT:    setne %dl
-; AVX512-NEXT:    orb %cl, %dl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %cl, %bl
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    kmovd %ecx, %k2
 ; AVX512-NEXT:    kshiftlw $3, %k2, %k2
@@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    kmovd %r8d, %k1
-; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kxorw %k0, %k2, %k0
-; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kxorw %k1, %k2, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    kmovd %edx, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k0, %k0
+; AVX512-NEXT:    korw %k0, %k1, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftlw $3, %k1, %k1
+; AVX512-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovd %esi, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512-NEXT:    kshiftlw $4, %k1, %k1
+; AVX512-NEXT:    kshiftlw $13, %k0, %k0
+; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    korw %k0, %k1, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0




More information about the llvm-commits mailing list