[llvm] 9dc9e0e - [X86] Optimization of inserting vxi1 sub vector into vXi1 vector

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 2 17:26:23 PST 2020


Author: Wang, Pengfei
Date: 2020-01-03T09:25:25+08:00
New Revision: 9dc9e0ea64f507488b5ca9cd656311db94433201

URL: https://github.com/llvm/llvm-project/commit/9dc9e0ea64f507488b5ca9cd656311db94433201
DIFF: https://github.com/llvm/llvm-project/commit/9dc9e0ea64f507488b5ca9cd656311db94433201.diff

LOG: [X86] Optimization of inserting vxi1 sub vector into vXi1 vector

Summary:
After the bug fix for the undef value case here, we used more operations to implement inserting a vXi1 subvector into a vXi1 vector. This patch optimizes it to use fewer operations.

The history of this change can be found at https://reviews.llvm.org/D68311

Reviewers: craig.topper, LuoYuanke, yubing, annita.zhang, pengfei, LiuChen3, RKSimon

Reviewed By: craig.topper

Subscribers: hiraditya, llvm-commits

Patch by Xiang Zhang (xiangzhangllvm)

Differential Revision: https://reviews.llvm.org/D71917

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx512-calling-conv.ll
    llvm/test/CodeGen/X86/avx512-ext.ll
    llvm/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/masked_store.ll
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d7287238185f..e22be2c1bdfc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5916,11 +5916,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // Widen the vector if needed.
   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
 
-  // Clear the upper bits of the subvector and move it to its insert position.
   unsigned ShiftLeft = NumElems - SubVecNumElems;
+  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+
+  // Do an optimization for the most frequently used types.
+  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
+    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
+    Mask0.flipAllBits();
+    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
+    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
+    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
+    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+
+    // Reduce to original width if needed.
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+  }
+
+  // Clear the upper bits of the subvector and move it to its insert position.
   SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                        DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
-  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
   SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                        DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
 

diff  --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b13c27e0d470..f07067f97650 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -531,246 +531,228 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    pushq %r12
 ; KNL-NEXT:    pushq %rbx
 ; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    movw $-3, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movw $-5, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $2, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movw $-9, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movw $-17, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $4, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movw $-33, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movw $-65, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movw $-129, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $7, %k1, %k1
-; KNL-NEXT:    kshiftlw $8, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movw $-257, %di ## imm = 0xFEFF
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $8, %k1, %k1
-; KNL-NEXT:    kshiftlw $9, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movw $-513, %di ## imm = 0xFDFF
+; KNL-NEXT:    kmovw %edi, %k5
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $9, %k1, %k1
-; KNL-NEXT:    kshiftlw $10, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movw $-1025, %di ## imm = 0xFBFF
+; KNL-NEXT:    kmovw %edi, %k4
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kshiftlw $11, %k0, %k6
-; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movw $-2049, %di ## imm = 0xF7FF
+; KNL-NEXT:    kmovw %edi, %k3
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kshiftlw $12, %k0, %k5
-; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movw $-4097, %di ## imm = 0xEFFF
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kshiftlw $13, %k0, %k4
-; KNL-NEXT:    korw %k1, %k4, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k1
+; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    movw $-8193, %di ## imm = 0xDFFF
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $2, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k6
+; KNL-NEXT:    movw $-16385, %di ## imm = 0xBFFF
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $13, %k1, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k2
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    korw %k0, %k1, %k0
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kshiftlw $1, %k6, %k6
+; KNL-NEXT:    kshiftrw $1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kmovw %esi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $2, %k2, %k2
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %esi, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %r8d, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
+; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %edx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %r9d, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
+; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
+; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %r8d, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
+; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %r9d, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kandw %k7, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kandw %k7, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kandw %k7, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $9, %k2, %k2
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kandw %k7, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $10, %k2, %k2
-; KNL-NEXT:    korw %k2, %k6, %k2
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k5, %k6, %k5
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $6, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kandw %k4, %k5, %k4
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $5, %k5, %k5
+; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    kandw %k3, %k4, %k3
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $4, %k4, %k4
+; KNL-NEXT:    korw %k4, %k3, %k3
+; KNL-NEXT:    kandw %k2, %k3, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k3
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $3, %k3, %k3
+; KNL-NEXT:    korw %k3, %k2, %k2
+; KNL-NEXT:    kandw %k0, %k2, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $11, %k2, %k2
-; KNL-NEXT:    korw %k2, %k5, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $2, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    xorl %ecx, %ecx
 ; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movl $65535, %edx ## imm = 0xFFFF
 ; KNL-NEXT:    movl $0, %esi
 ; KNL-NEXT:    cmovnel %edx, %esi
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $12, %k2, %k2
-; KNL-NEXT:    korw %k2, %k4, %k2
 ; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $13, %k2, %k2
-; KNL-NEXT:    korw %k2, %k3, %k2
 ; KNL-NEXT:    cmovnel %edx, %ecx
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
@@ -877,294 +859,225 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    pushq %r13
 ; SKX-NEXT:    pushq %r12
 ; SKX-NEXT:    pushq %rbx
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    movq %rdi, %rax
-; SKX-NEXT:    kshiftld $31, %k0, %k0
-; SKX-NEXT:    kshiftrd $31, %k0, %k1
-; SKX-NEXT:    kshiftld $2, %k0, %k0
-; SKX-NEXT:    kord %k0, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $30, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $3, %k1, %k2
-; SKX-NEXT:    kshiftld $3, %k2, %k2
-; SKX-NEXT:    kshiftld $30, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    movl $-3, %edi
+; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $30, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $29, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $4, %k1, %k2
-; SKX-NEXT:    kshiftld $4, %k2, %k2
-; SKX-NEXT:    kshiftld $29, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-5, %edi
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $29, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $28, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $5, %k1, %k2
-; SKX-NEXT:    kshiftld $5, %k2, %k2
-; SKX-NEXT:    kshiftld $28, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-9, %edi
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $28, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $27, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $6, %k1, %k2
-; SKX-NEXT:    kshiftld $6, %k2, %k2
-; SKX-NEXT:    kshiftld $27, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    movl $-17, %edi
+; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $27, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $26, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $7, %k1, %k2
-; SKX-NEXT:    kshiftld $7, %k2, %k2
-; SKX-NEXT:    kshiftld $26, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-33, %edi
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $26, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $25, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $8, %k1, %k2
-; SKX-NEXT:    kshiftld $8, %k2, %k2
-; SKX-NEXT:    kshiftld $25, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-65, %edi
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $25, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $24, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $9, %k1, %k2
-; SKX-NEXT:    kshiftld $9, %k2, %k2
-; SKX-NEXT:    kshiftld $24, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    movl $-129, %edi
+; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k2, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $24, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-257, %edi ## imm = 0xFEFF
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $23, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-513, %edi ## imm = 0xFDFF
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $22, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    movl $-1025, %edi ## imm = 0xFBFF
+; SKX-NEXT:    kmovd %edi, %k6
+; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $21, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-2049, %edi ## imm = 0xF7FF
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $20, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-4097, %edi ## imm = 0xEFFF
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $19, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    movl $-8193, %edi ## imm = 0xDFFF
+; SKX-NEXT:    kmovd %edi, %k4
+; SKX-NEXT:    kandd %k4, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $18, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-16385, %edi ## imm = 0xBFFF
+; SKX-NEXT:    kmovd %edi, %k5
+; SKX-NEXT:    kandd %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $17, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    movl $-32769, %edi ## imm = 0xFFFF7FFF
+; SKX-NEXT:    kmovd %edi, %k3
+; SKX-NEXT:    kandd %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $16, %k7, %k7
+; SKX-NEXT:    kord %k7, %k0, %k7
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    movl $-65537, %edi ## imm = 0xFFFEFFFF
+; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kandd %k2, %k7, %k7
+; SKX-NEXT:    kshiftld $31, %k0, %k0
+; SKX-NEXT:    kshiftrd $15, %k0, %k0
+; SKX-NEXT:    kord %k0, %k7, %k0
+; SKX-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kmovd %esi, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %edx, %k7
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $30, %k7, %k7
+; SKX-NEXT:    kord %k7, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %ecx, %k7
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $29, %k7, %k7
+; SKX-NEXT:    kord %k7, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %r8d, %k7
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $28, %k7, %k7
+; SKX-NEXT:    kord %k7, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %r9d, %k7
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $27, %k7, %k7
+; SKX-NEXT:    kord %k7, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k1
+; SKX-NEXT:    kshiftld $31, %k7, %k7
+; SKX-NEXT:    kshiftrd $26, %k7, %k7
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kord %k7, %k1, %k1
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload
+; SKX-NEXT:    kandd %k7, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k0, %k0
+; SKX-NEXT:    kshiftrd $25, %k0, %k0
+; SKX-NEXT:    kord %k0, %k1, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $24, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $23, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $10, %k1, %k2
-; SKX-NEXT:    kshiftld $10, %k2, %k2
-; SKX-NEXT:    kshiftld $23, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $23, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $22, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $11, %k1, %k2
-; SKX-NEXT:    kshiftld $11, %k2, %k2
-; SKX-NEXT:    kshiftld $22, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $22, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $21, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $12, %k1, %k2
-; SKX-NEXT:    kshiftld $12, %k2, %k2
-; SKX-NEXT:    kshiftld $21, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $21, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $20, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $13, %k1, %k2
-; SKX-NEXT:    kshiftld $13, %k2, %k2
-; SKX-NEXT:    kshiftld $20, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
+; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $20, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $19, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $14, %k1, %k2
-; SKX-NEXT:    kshiftld $14, %k2, %k2
-; SKX-NEXT:    kshiftld $19, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k6, %k1
 ; SKX-NEXT:    kshiftrd $19, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $18, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $15, %k1, %k2
-; SKX-NEXT:    kshiftld $15, %k2, %k2
-; SKX-NEXT:    kshiftld $18, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kandd %k4, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $18, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $17, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $16, %k1, %k2
-; SKX-NEXT:    kshiftld $16, %k2, %k2
-; SKX-NEXT:    kshiftld $17, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kandd %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $17, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $16, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kshiftrd $17, %k1, %k2
-; SKX-NEXT:    kshiftld $17, %k2, %k2
-; SKX-NEXT:    kshiftld $16, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kandd %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $16, %k1, %k1
-; SKX-NEXT:    kord %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $15, %k2, %k2
-; SKX-NEXT:    kord %k1, %k2, %k1
-; SKX-NEXT:    kmovd %esi, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $31, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kmovd %edx, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $30, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $3, %k0, %k2
-; SKX-NEXT:    kshiftld $3, %k2, %k2
-; SKX-NEXT:    kshiftld $30, %k0, %k0
-; SKX-NEXT:    kshiftrd $30, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovd %ecx, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $29, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $4, %k0, %k2
-; SKX-NEXT:    kshiftld $4, %k2, %k2
-; SKX-NEXT:    kshiftld $29, %k0, %k0
-; SKX-NEXT:    kshiftrd $29, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovd %r8d, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $28, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $5, %k0, %k2
-; SKX-NEXT:    kshiftld $5, %k2, %k2
-; SKX-NEXT:    kshiftld $28, %k0, %k0
-; SKX-NEXT:    kshiftrd $28, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovd %r9d, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $27, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $6, %k0, %k2
-; SKX-NEXT:    kshiftld $6, %k2, %k2
-; SKX-NEXT:    kshiftld $27, %k0, %k0
-; SKX-NEXT:    kshiftrd $27, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $26, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $7, %k0, %k2
-; SKX-NEXT:    kshiftld $7, %k2, %k2
-; SKX-NEXT:    kshiftld $26, %k0, %k0
-; SKX-NEXT:    kshiftrd $26, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $25, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $8, %k0, %k2
-; SKX-NEXT:    kshiftld $8, %k2, %k2
-; SKX-NEXT:    kshiftld $25, %k0, %k0
-; SKX-NEXT:    kshiftrd $25, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $24, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $9, %k0, %k2
-; SKX-NEXT:    kshiftld $9, %k2, %k2
-; SKX-NEXT:    kshiftld $24, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftrd $24, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $23, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $10, %k0, %k2
-; SKX-NEXT:    kshiftld $10, %k2, %k2
-; SKX-NEXT:    kshiftld $23, %k0, %k0
-; SKX-NEXT:    kshiftrd $23, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $22, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $11, %k0, %k2
-; SKX-NEXT:    kshiftld $11, %k2, %k2
-; SKX-NEXT:    kshiftld $22, %k0, %k0
-; SKX-NEXT:    kshiftrd $22, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $21, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $12, %k0, %k2
-; SKX-NEXT:    kshiftld $12, %k2, %k2
-; SKX-NEXT:    kshiftld $21, %k0, %k0
-; SKX-NEXT:    kshiftrd $21, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $20, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $13, %k0, %k2
-; SKX-NEXT:    kshiftld $13, %k2, %k2
-; SKX-NEXT:    kshiftld $20, %k0, %k0
-; SKX-NEXT:    kshiftrd $20, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $19, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $14, %k0, %k2
-; SKX-NEXT:    kshiftld $14, %k2, %k2
-; SKX-NEXT:    kshiftld $19, %k0, %k0
-; SKX-NEXT:    kshiftrd $19, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $18, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $15, %k0, %k2
-; SKX-NEXT:    kshiftld $15, %k2, %k2
-; SKX-NEXT:    kshiftld $18, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftrd $18, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $17, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $16, %k0, %k2
-; SKX-NEXT:    kshiftld $16, %k2, %k2
-; SKX-NEXT:    kshiftld $17, %k0, %k0
-; SKX-NEXT:    kshiftrd $17, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kshiftld $31, %k3, %k2
-; SKX-NEXT:    kshiftrd $16, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
-; SKX-NEXT:    kshiftrd $17, %k0, %k2
-; SKX-NEXT:    kshiftld $17, %k2, %k2
-; SKX-NEXT:    kshiftld $16, %k0, %k0
-; SKX-NEXT:    kshiftrd $16, %k0, %k0
-; SKX-NEXT:    kord %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $15, %k2, %k2
-; SKX-NEXT:    kord %k0, %k2, %k0
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kandd %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $15, %k1, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kshiftrd $16, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r8d
@@ -1260,249 +1173,231 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    pushl %edi
 ; KNL_X32-NEXT:    pushl %esi
 ; KNL_X32-NEXT:    subl $20, %esp
+; KNL_X32-NEXT:    movw $-3, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $2, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $1, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $3, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movw $-5, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $2, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $4, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movw $-9, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $3, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $5, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movw $-17, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $4, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $6, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movw $-33, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $5, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $7, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    movw $-65, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $6, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $9, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $9, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $9, %k0, %k0
+; KNL_X32-NEXT:    movw $-129, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $7, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $8, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $8, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $8, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k0
+; KNL_X32-NEXT:    movw $-257, %ax ## imm = 0xFEFF
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $8, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $9, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $7, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k0
+; KNL_X32-NEXT:    movw $-513, %ax ## imm = 0xFDFF
+; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $9, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k2
-; KNL_X32-NEXT:    kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $6, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $6, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k0
+; KNL_X32-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $10, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k6
-; KNL_X32-NEXT:    korw %k1, %k6, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $5, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k0
+; KNL_X32-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $11, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k5
-; KNL_X32-NEXT:    korw %k1, %k5, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $4, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $4, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
+; KNL_X32-NEXT:    movw $-4097, %ax ## imm = 0xEFFF
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $12, %k1, %k1
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k4
-; KNL_X32-NEXT:    korw %k1, %k4, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $3, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $3, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    movw $-8193, %ax ## imm = 0xDFFF
 ; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kshiftlw $13, %k1, %k0
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftlw $2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k2
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $2, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k6
+; KNL_X32-NEXT:    movw $-16385, %ax ## imm = 0xBFFF
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k1
-; KNL_X32-NEXT:    korw %k0, %k1, %k0
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, (%esp) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kshiftlw $1, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $1, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kmovw %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $15, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $2, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $9, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $9, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $8, %k2, %k2
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $8, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $8, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k7, %k6, %k6
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $9, %k2, %k2
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    korw %k2, %k7, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $7, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k5, %k6, %k5
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k6, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $6, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kandw %k4, %k5, %k4
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k5, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $5, %k5, %k5
+; KNL_X32-NEXT:    korw %k5, %k4, %k4
+; KNL_X32-NEXT:    kandw %k3, %k4, %k3
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k4, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
+; KNL_X32-NEXT:    kshiftrw $4, %k4, %k4
+; KNL_X32-NEXT:    korw %k4, %k3, %k3
+; KNL_X32-NEXT:    kandw %k2, %k3, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k3, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
+; KNL_X32-NEXT:    kshiftrw $3, %k3, %k3
+; KNL_X32-NEXT:    korw %k3, %k2, %k2
+; KNL_X32-NEXT:    kandw %k1, %k2, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k2, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $2, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    xorl %eax, %eax
 ; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl $65535, %ecx ## imm = 0xFFFF
 ; KNL_X32-NEXT:    movl $0, %edx
 ; KNL_X32-NEXT:    cmovnel %ecx, %edx
-; KNL_X32-NEXT:    kshiftlw $2, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k0
+; KNL_X32-NEXT:    kandw %k0, %k1, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
@@ -1513,7 +1408,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    cmovnel %ecx, %eax
-; KNL_X32-NEXT:    kmovw (%esp), %k2 ## 2-byte Reload
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kandw %k1, %k2, %k1
@@ -1610,550 +1505,373 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    movw $-3, %di
 ; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $14, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k3
-; KNL-NEXT:    kshiftlw $3, %k3, %k3
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    movw $-5, %di
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $13, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k3
-; KNL-NEXT:    kshiftlw $4, %k3, %k3
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k3
-; KNL-NEXT:    kshiftlw $5, %k3, %k3
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $11, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k3
-; KNL-NEXT:    kshiftlw $6, %k3, %k3
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $10, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k3
-; KNL-NEXT:    kshiftlw $7, %k3, %k3
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    movw $-9, %di
 ; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $9, %k3, %k3
-; KNL-NEXT:    korw %k0, %k3, %k3
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $14, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k4
-; KNL-NEXT:    kshiftlw $3, %k4, %k4
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $13, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k4
-; KNL-NEXT:    kshiftlw $4, %k4, %k4
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $12, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k4
-; KNL-NEXT:    kshiftlw $5, %k4, %k4
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k4
-; KNL-NEXT:    kshiftlw $6, %k4, %k4
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $10, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k4
-; KNL-NEXT:    kshiftlw $7, %k4, %k4
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    movw $-17, %di
 ; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $9, %k4, %k4
-; KNL-NEXT:    korw %k0, %k4, %k4
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $14, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k5
-; KNL-NEXT:    kshiftlw $3, %k5, %k5
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $13, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k5
-; KNL-NEXT:    kshiftlw $4, %k5, %k5
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $12, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k5
-; KNL-NEXT:    kshiftlw $5, %k5, %k5
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL-NEXT:    kshiftrw $11, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k5
-; KNL-NEXT:    kshiftlw $6, %k5, %k5
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $10, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k5
-; KNL-NEXT:    kshiftlw $7, %k5, %k5
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    movw $-33, %di
 ; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $9, %k5, %k5
-; KNL-NEXT:    korw %k0, %k5, %k5
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $14, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k6
-; KNL-NEXT:    kshiftlw $3, %k6, %k6
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
 ; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    movw $-65, %di
 ; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k6
-; KNL-NEXT:    kshiftlw $4, %k6, %k6
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k6
-; KNL-NEXT:    kshiftlw $5, %k6, %k6
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k6
-; KNL-NEXT:    kshiftlw $6, %k6, %k6
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k6
-; KNL-NEXT:    kshiftlw $7, %k6, %k6
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k0, %k6, %k6
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k7
-; KNL-NEXT:    kshiftlw $3, %k7, %k7
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k7
-; KNL-NEXT:    kshiftlw $4, %k7, %k7
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
 ; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k7
-; KNL-NEXT:    kshiftlw $5, %k7, %k7
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k7
-; KNL-NEXT:    kshiftlw $6, %k7, %k7
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
 ; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k7
-; KNL-NEXT:    kshiftlw $7, %k7, %k7
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k7
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %edx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %r8d, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %r9d, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %edx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    kmovw %r8d, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    kmovw %r9d, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $3, %k1, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kshiftlw $14, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k1
-; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kandw %k2, %k7, %k2
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kandw %k1, %k2, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $4, %k1, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kshiftlw $13, %k1, %k1
-; KNL-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $5, %k1, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $6, %k1, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    kshiftrw $7, %k1, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kandw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
@@ -2196,491 +1914,339 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-LABEL: test17:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    movb $-3, %dil
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    kshiftlb $7, %k0, %k0
-; SKX-NEXT:    kshiftrb $7, %k0, %k1
-; SKX-NEXT:    kshiftlb $2, %k0, %k0
-; SKX-NEXT:    korb %k0, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kshiftrb $3, %k1, %k2
-; SKX-NEXT:    kshiftlb $3, %k2, %k2
-; SKX-NEXT:    kshiftlb $6, %k1, %k1
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $6, %k1, %k1
-; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    movb $-5, %dil
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
 ; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kshiftrb $4, %k1, %k2
-; SKX-NEXT:    kshiftlb $4, %k2, %k2
-; SKX-NEXT:    kshiftlb $5, %k1, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k2
-; SKX-NEXT:    kshiftlb $5, %k2, %k2
-; SKX-NEXT:    kshiftlb $4, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftrb $4, %k1, %k1
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k3, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k2
-; SKX-NEXT:    kshiftlb $6, %k2, %k2
-; SKX-NEXT:    kshiftlb $3, %k1, %k1
-; SKX-NEXT:    kshiftrb $3, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k3, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kshiftrb $7, %k1, %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftlb $2, %k1, %k1
-; SKX-NEXT:    kshiftrb $2, %k1, %k1
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $1, %k2, %k2
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $7, %k2, %k2
-; SKX-NEXT:    korb %k0, %k2, %k2
+; SKX-NEXT:    korb %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    movb $-9, %dil
+; SKX-NEXT:    kmovd %edi, %k7
+; SKX-NEXT:    kandb %k7, %k0, %k0
 ; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $6, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k3
-; SKX-NEXT:    kshiftlb $3, %k3, %k3
-; SKX-NEXT:    kshiftlb $6, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k3
-; SKX-NEXT:    kshiftlb $4, %k3, %k3
-; SKX-NEXT:    kshiftlb $5, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k3
-; SKX-NEXT:    kshiftlb $5, %k3, %k3
-; SKX-NEXT:    kshiftlb $4, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k3
-; SKX-NEXT:    kshiftlb $6, %k3, %k3
-; SKX-NEXT:    kshiftlb $3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kshiftlb $7, %k4, %k3
-; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $7, %k2, %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftlb $2, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kshiftlb $7, %k4, %k3
-; SKX-NEXT:    kshiftrb $1, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kandb %k1, %k2, %k1
-; SKX-NEXT:    kshiftlb $7, %k4, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftrb $7, %k2, %k2
-; SKX-NEXT:    korb %k0, %k2, %k2
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $6, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k3
-; SKX-NEXT:    kshiftlb $3, %k3, %k3
-; SKX-NEXT:    kshiftlb $6, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kshiftlb $7, %k4, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k3
-; SKX-NEXT:    kshiftlb $4, %k3, %k3
-; SKX-NEXT:    kshiftlb $5, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k3
-; SKX-NEXT:    kshiftlb $5, %k3, %k3
-; SKX-NEXT:    kshiftlb $4, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k3
-; SKX-NEXT:    kshiftlb $6, %k3, %k3
-; SKX-NEXT:    kshiftlb $3, %k2, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kshiftrb $7, %k2, %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftlb $2, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    korb %k3, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $1, %k3, %k3
-; SKX-NEXT:    korb %k2, %k3, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $7, %k3, %k3
-; SKX-NEXT:    korb %k0, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k4
-; SKX-NEXT:    kshiftlb $3, %k4, %k4
-; SKX-NEXT:    kshiftlb $6, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $6, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kshiftlb $7, %k5, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k4
-; SKX-NEXT:    kshiftlb $4, %k4, %k4
-; SKX-NEXT:    kshiftlb $5, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kshiftlb $7, %k5, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k4
-; SKX-NEXT:    kshiftlb $5, %k4, %k4
-; SKX-NEXT:    kshiftlb $4, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $6, %k3, %k4
-; SKX-NEXT:    kshiftlb $6, %k4, %k4
-; SKX-NEXT:    kshiftlb $3, %k3, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $7, %k3, %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftlb $2, %k3, %k3
-; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $1, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $7, %k4, %k4
-; SKX-NEXT:    korb %k0, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $6, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k5
-; SKX-NEXT:    kshiftlb $3, %k5, %k5
-; SKX-NEXT:    kshiftlb $6, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k5
-; SKX-NEXT:    kshiftlb $4, %k5, %k5
-; SKX-NEXT:    kshiftlb $5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kshiftrb $5, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k6, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k5
-; SKX-NEXT:    kshiftlb $5, %k5, %k5
-; SKX-NEXT:    kshiftlb $4, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k6, %k5
-; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kshiftlb $6, %k5, %k5
-; SKX-NEXT:    kshiftlb $3, %k4, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $7, %k4, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftlb $2, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kandb %k3, %k4, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kandb %k2, %k3, %k2
-; SKX-NEXT:    kshiftlb $7, %k4, %k3
-; SKX-NEXT:    kshiftrb $7, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    korb %k0, %k3, %k3
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k4
-; SKX-NEXT:    kshiftlb $3, %k4, %k4
-; SKX-NEXT:    kshiftlb $6, %k3, %k3
-; SKX-NEXT:    kshiftrb $6, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k4
-; SKX-NEXT:    kshiftlb $4, %k4, %k4
-; SKX-NEXT:    kshiftlb $5, %k3, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $5, %k3, %k4
-; SKX-NEXT:    kshiftlb $5, %k4, %k4
-; SKX-NEXT:    kshiftlb $4, %k3, %k3
 ; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $6, %k3, %k4
-; SKX-NEXT:    kshiftlb $6, %k4, %k4
-; SKX-NEXT:    kshiftlb $3, %k3, %k3
-; SKX-NEXT:    kshiftrb $3, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftrb $7, %k3, %k4
-; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftlb $2, %k3, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $2, %k3, %k3
-; SKX-NEXT:    korb %k4, %k3, %k3
-; SKX-NEXT:    kshiftlb $7, %k5, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $1, %k4, %k4
-; SKX-NEXT:    korb %k3, %k4, %k3
-; SKX-NEXT:    kshiftlb $7, %k5, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftrb $7, %k4, %k4
-; SKX-NEXT:    korb %k0, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $6, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k5
-; SKX-NEXT:    kshiftlb $3, %k5, %k5
-; SKX-NEXT:    kshiftlb $6, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k6, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k5
-; SKX-NEXT:    kshiftlb $4, %k5, %k5
-; SKX-NEXT:    kshiftlb $5, %k4, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k5
-; SKX-NEXT:    kshiftlb $5, %k5, %k5
-; SKX-NEXT:    kshiftlb $4, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kshiftlb $6, %k5, %k5
-; SKX-NEXT:    kshiftlb $3, %k4, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $7, %k4, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftlb $2, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kandb %k3, %k4, %k3
+; SKX-NEXT:    korb %k3, %k0, %k0
+; SKX-NEXT:    movb $-17, %dil
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $7, %k4, %k4
-; SKX-NEXT:    korb %k0, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $6, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k5
-; SKX-NEXT:    kshiftlb $3, %k5, %k5
-; SKX-NEXT:    kshiftlb $6, %k4, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k5
-; SKX-NEXT:    kshiftlb $4, %k5, %k5
-; SKX-NEXT:    kshiftlb $5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    kshiftrb $5, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k6, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $5, %k4, %k5
-; SKX-NEXT:    kshiftlb $5, %k5, %k5
-; SKX-NEXT:    kshiftlb $4, %k4, %k4
-; SKX-NEXT:    kshiftrb $4, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kshiftlb $7, %k6, %k5
-; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $6, %k4, %k5
-; SKX-NEXT:    kshiftlb $6, %k5, %k5
-; SKX-NEXT:    kshiftlb $3, %k4, %k4
 ; SKX-NEXT:    kshiftrb $3, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
+; SKX-NEXT:    korb %k4, %k0, %k0
+; SKX-NEXT:    movb $-33, %dil
+; SKX-NEXT:    kmovd %edi, %k4
+; SKX-NEXT:    kandb %k4, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
 ; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kshiftrb $7, %k4, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftlb $2, %k4, %k4
-; SKX-NEXT:    kshiftrb $2, %k4, %k4
-; SKX-NEXT:    korb %k5, %k4, %k4
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    korb %k4, %k5, %k4
-; SKX-NEXT:    kmovd %esi, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $7, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kmovd %edx, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $6, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kshiftrb $3, %k0, %k5
-; SKX-NEXT:    kshiftlb $3, %k5, %k5
-; SKX-NEXT:    kshiftlb $6, %k0, %k0
-; SKX-NEXT:    kshiftrb $6, %k0, %k0
 ; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovd %ecx, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $5, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kshiftrb $4, %k0, %k5
-; SKX-NEXT:    kshiftlb $4, %k5, %k5
-; SKX-NEXT:    kshiftlb $5, %k0, %k0
-; SKX-NEXT:    kshiftrb $5, %k0, %k0
-; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovd %r8d, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $4, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kshiftrb $5, %k0, %k5
-; SKX-NEXT:    kshiftlb $5, %k5, %k5
-; SKX-NEXT:    kshiftlb $4, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    movb $-65, %dil
+; SKX-NEXT:    kmovd %edi, %k5
+; SKX-NEXT:    kandb %k5, %k0, %k1
+; SKX-NEXT:    kshiftlb $7, %k6, %k6
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kshiftrb $1, %k6, %k6
+; SKX-NEXT:    korb %k6, %k1, %k1
+; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; SKX-NEXT:    kandb %k3, %k0, %k2
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    korb %k1, %k2, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
 ; SKX-NEXT:    kshiftrb $4, %k0, %k0
-; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovd %r9d, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $3, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kshiftrb $6, %k0, %k5
-; SKX-NEXT:    kshiftlb $6, %k5, %k5
-; SKX-NEXT:    kshiftlb $3, %k0, %k0
-; SKX-NEXT:    kshiftrb $3, %k0, %k0
-; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $2, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
-; SKX-NEXT:    kshiftrb $7, %k0, %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftlb $2, %k0, %k0
-; SKX-NEXT:    kshiftrb $2, %k0, %k0
-; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $1, %k5, %k5
-; SKX-NEXT:    korb %k0, %k5, %k0
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; SKX-NEXT:    kandb %k2, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k7, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovq %k2, %k3
 ; SKX-NEXT:    kandb %k2, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kmovq %k6, %k0
+; SKX-NEXT:    kandb %k6, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; SKX-NEXT:    kandb %k4, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k6, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k3, %k1, %k1
+; SKX-NEXT:    kmovq %k3, %k6
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; SKX-NEXT:    kandb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k5, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $1, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandb %k0, %k1, %k1
+; SKX-NEXT:    kmovq %k0, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k4, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kandb %k6, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $2, %k0, %k0
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandb %k7, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k3, %k2, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; SKX-NEXT:    kandb %k6, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k4, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k6, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; SKX-NEXT:    kandb %k6, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k5, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $1, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k0, %k1, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k7, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; SKX-NEXT:    kandb %k2, %k1, %k1
+; SKX-NEXT:    kmovd %edx, %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $6, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k3, %k1, %k1
+; SKX-NEXT:    kmovd %ecx, %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k5, %k1, %k1
+; SKX-NEXT:    kmovd %r8d, %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k6, %k1, %k1
+; SKX-NEXT:    kmovd %r9d, %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k3, %k2
+; SKX-NEXT:    kshiftrb $2, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k4, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $1, %k2, %k2
+; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kandb %k0, %k1, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r8d
@@ -2720,557 +2286,380 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-LABEL: test17:
 ; KNL_X32:       ## %bb.0:
 ; KNL_X32-NEXT:    pushl %ebx
-; KNL_X32-NEXT:    pushl %eax
+; KNL_X32-NEXT:    subl $16, %esp
+; KNL_X32-NEXT:    movw $-3, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $2, %k0, %k1
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    movw $-5, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
+; KNL_X32-NEXT:    kshiftrw $13, %k3, %k3
+; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    movw $-9, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
+; KNL_X32-NEXT:    kshiftrw $12, %k4, %k4
+; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    movw $-17, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $11, %k5, %k5
+; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    movw $-33, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    movw $-65, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kmovw %k0, (%esp) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $14, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k3
-; KNL_X32-NEXT:    kshiftlw $3, %k3, %k3
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $13, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k3
-; KNL_X32-NEXT:    kshiftlw $4, %k3, %k3
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $12, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k3
-; KNL_X32-NEXT:    kshiftlw $5, %k3, %k3
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $11, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k3
-; KNL_X32-NEXT:    kshiftlw $6, %k3, %k3
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $10, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k3
-; KNL_X32-NEXT:    kshiftlw $7, %k3, %k3
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $9, %k3, %k3
-; KNL_X32-NEXT:    korw %k0, %k3, %k3
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $14, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k4
-; KNL_X32-NEXT:    kshiftlw $3, %k4, %k4
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $13, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k4
-; KNL_X32-NEXT:    kshiftlw $4, %k4, %k4
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $12, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k4
-; KNL_X32-NEXT:    kshiftlw $5, %k4, %k4
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $11, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k4
-; KNL_X32-NEXT:    kshiftlw $6, %k4, %k4
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $10, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k4
-; KNL_X32-NEXT:    kshiftlw $7, %k4, %k4
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $9, %k4, %k4
-; KNL_X32-NEXT:    korw %k0, %k4, %k4
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $14, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k5
-; KNL_X32-NEXT:    kshiftlw $3, %k5, %k5
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $13, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k5
-; KNL_X32-NEXT:    kshiftlw $4, %k5, %k5
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $12, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k5
-; KNL_X32-NEXT:    kshiftlw $5, %k5, %k5
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $11, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k5
-; KNL_X32-NEXT:    kshiftlw $6, %k5, %k5
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $10, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k5
-; KNL_X32-NEXT:    kshiftlw $7, %k5, %k5
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $9, %k5, %k5
-; KNL_X32-NEXT:    korw %k0, %k5, %k5
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $14, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k6
-; KNL_X32-NEXT:    kshiftlw $3, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k6
-; KNL_X32-NEXT:    kshiftlw $4, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k6
-; KNL_X32-NEXT:    kshiftlw $5, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k6
-; KNL_X32-NEXT:    kshiftlw $6, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k6
-; KNL_X32-NEXT:    kshiftlw $7, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k6
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k7
-; KNL_X32-NEXT:    kshiftlw $3, %k7, %k7
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k7
-; KNL_X32-NEXT:    kshiftlw $4, %k7, %k7
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k7
-; KNL_X32-NEXT:    kshiftlw $5, %k7, %k7
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k7
-; KNL_X32-NEXT:    kshiftlw $6, %k7, %k7
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k7
-; KNL_X32-NEXT:    kshiftlw $7, %k7, %k7
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k0
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kandw %k6, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    korw %k0, %k2, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $15, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kandw %k2, %k7, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $14, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
-; KNL_X32-NEXT:    kshiftrw $3, %k1, %k2
-; KNL_X32-NEXT:    kshiftlw $3, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k2, %k2
+; KNL_X32-NEXT:    kandw %k1, %k2, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
-; KNL_X32-NEXT:    kshiftrw $4, %k1, %k2
-; KNL_X32-NEXT:    kshiftlw $4, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $13, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
+; KNL_X32-NEXT:    kandw %k3, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
-; KNL_X32-NEXT:    kshiftrw $5, %k1, %k2
-; KNL_X32-NEXT:    kshiftlw $5, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $12, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
+; KNL_X32-NEXT:    kandw %k4, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
-; KNL_X32-NEXT:    kshiftrw $6, %k1, %k2
-; KNL_X32-NEXT:    kshiftlw $6, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $11, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
+; KNL_X32-NEXT:    kandw %k5, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
-; KNL_X32-NEXT:    kshiftrw $7, %k1, %k2
-; KNL_X32-NEXT:    kshiftlw $7, %k2, %k2
-; KNL_X32-NEXT:    kshiftlw $10, %k1, %k1
-; KNL_X32-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
+; KNL_X32-NEXT:    kandw %k6, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    korw %k1, %k2, %k1
+; KNL_X32-NEXT:    korw %k2, %k1, %k1
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kandw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
-; KNL_X32-NEXT:    kmovw (%esp), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
@@ -3308,7 +2697,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andb $127, %cl
 ; KNL_X32-NEXT:    movb %cl, (%eax)
-; KNL_X32-NEXT:    addl $4, %esp
+; KNL_X32-NEXT:    addl $16, %esp
 ; KNL_X32-NEXT:    popl %ebx
 ; KNL_X32-NEXT:    retl $4
   %j = and <7 x i1> %a, %b

diff  --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index fcb07a504067..349a7d63a4cf 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1886,474 +1886,432 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
 define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: test21:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    movw $-3, %ax
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k2
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kmovw %esi, %k1
-; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k3
+; KNL-NEXT:    movw $-5, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %edx, %k1
-; KNL-NEXT:    kshiftlw $2, %k1, %k1
-; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k4
+; KNL-NEXT:    movw $-9, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k5
+; KNL-NEXT:    movw $-17, %ax
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kmovw %r8d, %k1
-; KNL-NEXT:    kshiftlw $4, %k1, %k1
-; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k6
+; KNL-NEXT:    movw $-33, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k3
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kmovw %r9d, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k7
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    movw $-65, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $7, %k1, %k1
-; KNL-NEXT:    kshiftlw $8, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movw $-129, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k4
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $8, %k1, %k1
-; KNL-NEXT:    kshiftlw $9, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $9, %k1, %k1
-; KNL-NEXT:    kshiftlw $10, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k5
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kshiftlw $11, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kshiftlw $12, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kshiftlw $13, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $13, %k1, %k1
-; KNL-NEXT:    kshiftlw $14, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k0, %k2, %k0
-; KNL-NEXT:    korw %k0, %k1, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $2, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    movw $-16385, %ax # imm = 0xBFFF
 ; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $2, %k1, %k1
-; KNL-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k3, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k4, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $4, %k1, %k1
-; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k5, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    korw %k1, %k6, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $8, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $9, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $13, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $14, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $4, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $3, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $2, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $2, %k1, %k1
-; KNL-NEXT:    korw %k1, %k3, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k4, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $4, %k1, %k1
-; KNL-NEXT:    korw %k1, %k5, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    korw %k1, %k6, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k2, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $8, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k3, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $9, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k4, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k5, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $13, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $14, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    korw %k1, %k7, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $4, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $3, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $2, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k1, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $15, %k7, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
+; KNL-NEXT:    korw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $2, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    korw %k7, %k1, %k7
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k7, %k7
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k7, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $3, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    korw %k7, %k1, %k7
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $4, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    korw %k7, %k1, %k7
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k7
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $6, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    korw %k7, %k1, %k7
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k2, %k7
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k3, %k7
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k4, %k7
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k3, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k5, %k6
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
-; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k4, %k6, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k6
-; KNL-NEXT:    kshiftlw $11, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k6, %k2, %k5
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
-; KNL-NEXT:    korw %k5, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k5, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kshiftlw $12, %k5, %k5
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k5, %k2, %k4
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
-; KNL-NEXT:    korw %k4, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $4, %k5, %k5
+; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    kandw %k2, %k4, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $13, %k4, %k4
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    korw %k4, %k2, %k3
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
-; KNL-NEXT:    korw %k3, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $3, %k4, %k4
+; KNL-NEXT:    korw %k4, %k3, %k3
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k3, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftlw $14, %k3, %k3
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $2, %k3, %k3
 ; KNL-NEXT:    korw %k3, %k2, %k2
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k2, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
@@ -2362,7 +2320,6 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k0, %k2
 ; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; KNL-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
@@ -2389,475 +2346,433 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ;
 ; AVX512DQNOBW-LABEL: test21:
 ; AVX512DQNOBW:       # %bb.0:
+; AVX512DQNOBW-NEXT:    movw $-3, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kmovw %edi, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k2
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kmovw %esi, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k3
+; AVX512DQNOBW-NEXT:    movw $-5, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %edx, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k3, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k4
+; AVX512DQNOBW-NEXT:    movw $-9, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %ecx, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k4, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k5
+; AVX512DQNOBW-NEXT:    movw $-17, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kmovw %r8d, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k5, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k6
+; AVX512DQNOBW-NEXT:    movw $-33, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k3
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kmovw %r9d, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k6, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k7
+; AVX512DQNOBW-NEXT:    movw $-65, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k1, %k1
-; AVX512DQNOBW-NEXT:    korw %k1, %k7, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-129, %ax
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k5
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k1, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k0, %k1, %k0
-; AVX512DQNOBW-NEXT:    korw %k0, %k2, %k0
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k6, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k6, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k3, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k4, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k5, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
-; AVX512DQNOBW-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k2, %k0, %k2
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k2, %k7, %k2
+; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k7, %k7
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k7, %k7
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k7, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k7, %k7
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k7, %k7
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k7, %k7
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k1, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k3, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $8, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $9, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k4, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k7, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kandw %k3, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $10, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k5, %k6
-; AVX512DQNOBW-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k6, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
+; AVX512DQNOBW-NEXT:    kandw %k4, %k6, %k5
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
-; AVX512DQNOBW-NEXT:    kshiftlw $11, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k5, %k5
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k6, %k1, %k5
-; AVX512DQNOBW-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k5, %k2, %k2
+; AVX512DQNOBW-NEXT:    kandw %k1, %k5, %k4
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
-; AVX512DQNOBW-NEXT:    kshiftlw $12, %k5, %k5
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k5, %k1, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k4, %k2, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k5, %k5
+; AVX512DQNOBW-NEXT:    korw %k5, %k4, %k4
+; AVX512DQNOBW-NEXT:    kandw %k2, %k4, %k3
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
-; AVX512DQNOBW-NEXT:    kshiftlw $13, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512DQNOBW-NEXT:    korw %k4, %k3, %k3
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k4, %k1, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k3, %k2, %k2
+; AVX512DQNOBW-NEXT:    kandw %k1, %k3, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k3, %k3
+; AVX512DQNOBW-NEXT:    korw %k3, %k2, %k2
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    korw %k3, %k1, %k1
-; AVX512DQNOBW-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQNOBW-NEXT:    korw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    kandw %k1, %k2, %k1
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k2
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k1, %k1
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k1, %k1
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
@@ -2865,7 +2780,6 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQNOBW-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovm2d %k1, %zmm4
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm5
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    vpmovm2d %k0, %zmm6

diff  --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 6e36bd1bb0eb..6139928cbd15 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -302,14 +302,12 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
 ; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    movw $-1025, %cx ## imm = 0xFBFF
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -319,13 +317,11 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    kshiftrw $11, %k1, %k2
-; SKX-NEXT:    kshiftlw $11, %k2, %k2
-; SKX-NEXT:    kshiftlw $6, %k1, %k1
-; SKX-NEXT:    kshiftrw $6, %k1, %k1
+; SKX-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kandw %k2, %k1, %k1
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $5, %k0, %k0
-; SKX-NEXT:    korw %k0, %k2, %k0
 ; SKX-NEXT:    korw %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -342,15 +338,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
 ; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movw $-17, %cx
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $11, %k1, %k1
-; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    retq
@@ -359,13 +353,11 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k2
-; SKX-NEXT:    kshiftlb $5, %k2, %k2
-; SKX-NEXT:    kshiftlb $4, %k1, %k1
-; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    movb $-17, %al
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kandb %k2, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k0, %k0
 ; SKX-NEXT:    kshiftrb $3, %k0, %k0
-; SKX-NEXT:    korb %k0, %k2, %k0
 ; SKX-NEXT:    korb %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
@@ -801,15 +793,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
-; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    movw $-17, %dx
+; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0 {%k1}
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    orl %ecx, %eax
@@ -823,14 +812,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
-; SKX-NEXT:    kshiftrd $5, %k0, %k1
-; SKX-NEXT:    kshiftld $5, %k1, %k1
-; SKX-NEXT:    kshiftld $28, %k0, %k0
-; SKX-NEXT:    kshiftrd $28, %k0, %k0
-; SKX-NEXT:    kmovd %eax, %k2
-; SKX-NEXT:    kshiftld $31, %k2, %k2
-; SKX-NEXT:    kshiftrd $27, %k2, %k2
-; SKX-NEXT:    kord %k2, %k1, %k1
+; SKX-NEXT:    movl $-17, %ecx
+; SKX-NEXT:    kmovd %ecx, %k1
+; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $27, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    vzeroupper
@@ -849,16 +836,13 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    cmpl %esi, %edi
 ; KNL-NEXT:    setb %al
-; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    movw $-5, %cx
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
-; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -868,16 +852,13 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
-; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0
-; SKX-NEXT:    kshiftrb $3, %k0, %k1
-; SKX-NEXT:    kshiftlb $3, %k1, %k1
-; SKX-NEXT:    kshiftlb $6, %k0, %k0
-; SKX-NEXT:    kshiftrb $6, %k0, %k0
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    movb $-5, %cl
+; SKX-NEXT:    kmovd %ecx, %k1
+; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1}
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    korw %k0, %k1, %k0
+; SKX-NEXT:    korw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index ff0f1d34076d..1fb0b3891520 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1069,15 +1069,13 @@ define <64 x i8> @test16(i64 %x) {
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftrw $6, %k0, %k4
-; KNL-NEXT:    kshiftlw $6, %k4, %k4
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    movw $-33, %ax
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb $1, %al
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $10, %k5, %k5
-; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $10, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1095,15 +1093,13 @@ define <64 x i8> @test16(i64 %x) {
 ; SKX-LABEL: test16:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovq %rdi, %k0
-; SKX-NEXT:    kshiftrq $6, %k0, %k1
-; SKX-NEXT:    kshiftlq $6, %k1, %k1
-; SKX-NEXT:    kshiftlq $59, %k0, %k0
-; SKX-NEXT:    kshiftrq $59, %k0, %k0
+; SKX-NEXT:    movq $-33, %rax
+; SKX-NEXT:    kmovq %rax, %k1
+; SKX-NEXT:    kandq %k1, %k0, %k0
 ; SKX-NEXT:    movb $1, %al
-; SKX-NEXT:    kmovd %eax, %k2
-; SKX-NEXT:    kshiftlq $63, %k2, %k2
-; SKX-NEXT:    kshiftrq $58, %k2, %k2
-; SKX-NEXT:    korq %k2, %k1, %k1
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    kshiftlq $63, %k1, %k1
+; SKX-NEXT:    kshiftrq $58, %k1, %k1
 ; SKX-NEXT:    korq %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
@@ -1111,15 +1107,13 @@ define <64 x i8> @test16(i64 %x) {
 ; AVX512BW-LABEL: test16:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    kshiftlq $6, %k1, %k1
-; AVX512BW-NEXT:    kshiftlq $59, %k0, %k0
-; AVX512BW-NEXT:    kshiftrq $59, %k0, %k0
+; AVX512BW-NEXT:    movq $-33, %rax
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    kandq %k1, %k0, %k0
 ; AVX512BW-NEXT:    movb $1, %al
-; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    kshiftlq $63, %k2, %k2
-; AVX512BW-NEXT:    kshiftrq $58, %k2, %k2
-; AVX512BW-NEXT:    korq %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
 ; AVX512BW-NEXT:    korq %k1, %k0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -1135,15 +1129,13 @@ define <64 x i8> @test16(i64 %x) {
 ; AVX512DQ-NEXT:    kmovw %ecx, %k0
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512DQ-NEXT:    kshiftlw $6, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512DQ-NEXT:    movw $-33, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kandw %k4, %k1, %k1
 ; AVX512DQ-NEXT:    movb $1, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
-; AVX512DQ-NEXT:    kshiftrw $10, %k5, %k5
-; AVX512DQ-NEXT:    korw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
 ; AVX512DQ-NEXT:    korw %k4, %k1, %k1
 ; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1194,14 +1186,12 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    cmpl %edx, %esi
 ; KNL-NEXT:    setg %al
-; KNL-NEXT:    kshiftrw $6, %k0, %k4
-; KNL-NEXT:    kshiftlw $6, %k4, %k4
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $10, %k5, %k5
-; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    movw $-33, %cx
+; KNL-NEXT:    kmovw %ecx, %k4
+; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $10, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1221,14 +1211,12 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
-; SKX-NEXT:    kshiftrq $6, %k0, %k1
-; SKX-NEXT:    kshiftlq $6, %k1, %k1
-; SKX-NEXT:    kshiftlq $59, %k0, %k0
-; SKX-NEXT:    kshiftrq $59, %k0, %k0
-; SKX-NEXT:    kmovd %eax, %k2
-; SKX-NEXT:    kshiftlq $63, %k2, %k2
-; SKX-NEXT:    kshiftrq $58, %k2, %k2
-; SKX-NEXT:    korq %k2, %k1, %k1
+; SKX-NEXT:    movq $-33, %rcx
+; SKX-NEXT:    kmovq %rcx, %k1
+; SKX-NEXT:    kandq %k1, %k0, %k0
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    kshiftlq $63, %k1, %k1
+; SKX-NEXT:    kshiftrq $58, %k1, %k1
 ; SKX-NEXT:    korq %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
@@ -1238,14 +1226,12 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
 ; AVX512BW-NEXT:    cmpl %edx, %esi
 ; AVX512BW-NEXT:    setg %al
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    kshiftlq $6, %k1, %k1
-; AVX512BW-NEXT:    kshiftlq $59, %k0, %k0
-; AVX512BW-NEXT:    kshiftrq $59, %k0, %k0
-; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    kshiftlq $63, %k2, %k2
-; AVX512BW-NEXT:    kshiftrq $58, %k2, %k2
-; AVX512BW-NEXT:    korq %k2, %k1, %k1
+; AVX512BW-NEXT:    movq $-33, %rcx
+; AVX512BW-NEXT:    kmovq %rcx, %k1
+; AVX512BW-NEXT:    kandq %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
 ; AVX512BW-NEXT:    korq %k1, %k0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -1263,14 +1249,12 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
 ; AVX512DQ-NEXT:    cmpl %edx, %esi
 ; AVX512DQ-NEXT:    setg %al
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512DQ-NEXT:    kshiftlw $6, %k4, %k4
-; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
-; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
-; AVX512DQ-NEXT:    kshiftrw $10, %k5, %k5
-; AVX512DQ-NEXT:    korw %k5, %k4, %k4
+; AVX512DQ-NEXT:    movw $-33, %cx
+; AVX512DQ-NEXT:    kmovw %ecx, %k4
+; AVX512DQ-NEXT:    kandw %k4, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %eax, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
 ; AVX512DQ-NEXT:    korw %k4, %k1, %k1
 ; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1316,11 +1300,10 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; KNL-NEXT:    kshiftrw $9, %k1, %k1
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k3
+; KNL-NEXT:    movw $-65, %ax
+; KNL-NEXT:    kmovw %eax, %k3
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    korw %k1, %k3, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftlw $9, %k0, %k0
 ; KNL-NEXT:    kshiftrw $9, %k0, %k0
@@ -1338,11 +1321,10 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    kshiftrw $9, %k1, %k1
-; SKX-NEXT:    kshiftlb $2, %k0, %k0
-; SKX-NEXT:    kshiftrb $2, %k0, %k0
-; SKX-NEXT:    kshiftlb $7, %k0, %k3
+; SKX-NEXT:    movb $-65, %al
+; SKX-NEXT:    kmovd %eax, %k3
+; SKX-NEXT:    kandb %k3, %k0, %k0
 ; SKX-NEXT:    kshiftlb $6, %k1, %k1
-; SKX-NEXT:    korb %k1, %k3, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kshiftlb $1, %k0, %k0
 ; SKX-NEXT:    kshiftrb $1, %k0, %k0
@@ -1357,11 +1339,10 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512BW-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
-; AVX512BW-NEXT:    kshiftlw $7, %k0, %k3
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512BW-NEXT:    kshiftlw $6, %k1, %k1
-; AVX512BW-NEXT:    korw %k1, %k3, %k1
 ; AVX512BW-NEXT:    korw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kshiftlw $9, %k0, %k0
 ; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
@@ -1378,11 +1359,10 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; AVX512DQ-NEXT:    kmovw %esi, %k1
 ; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlb $2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrb $2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k3
+; AVX512DQ-NEXT:    movb $-65, %al
+; AVX512DQ-NEXT:    kmovw %eax, %k3
+; AVX512DQ-NEXT:    kandb %k3, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlb $6, %k1, %k1
-; AVX512DQ-NEXT:    korb %k1, %k3, %k1
 ; AVX512DQ-NEXT:    korb %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlb $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k0
@@ -1400,11 +1380,10 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-NEXT:    kshiftrw $9, %k1, %k1
-; X86-NEXT:    kshiftlb $7, %k0, %k3
-; X86-NEXT:    kshiftlb $2, %k0, %k0
-; X86-NEXT:    kshiftrb $2, %k0, %k0
+; X86-NEXT:    movb $-65, %al
+; X86-NEXT:    kmovd %eax, %k3
+; X86-NEXT:    kandb %k3, %k0, %k0
 ; X86-NEXT:    kshiftlb $6, %k1, %k1
-; X86-NEXT:    korb %k1, %k3, %k1
 ; X86-NEXT:    korb %k1, %k0, %k0
 ; X86-NEXT:    kshiftlb $1, %k0, %k0
 ; X86-NEXT:    kshiftrb $1, %k0, %k0
@@ -2834,476 +2813,434 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## %bb.0:
+; KNL-NEXT:    movw $-3, %ax
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k2
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kmovw %edx, %k1
-; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k3
+; KNL-NEXT:    movw $-5, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    kshiftlw $2, %k1, %k1
-; KNL-NEXT:    korw %k1, %k3, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k4
+; KNL-NEXT:    movw $-9, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %r8d, %k1
-; KNL-NEXT:    kshiftlw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k4, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k5
+; KNL-NEXT:    movw $-17, %ax
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kmovw %r9d, %k1
-; KNL-NEXT:    kshiftlw $4, %k1, %k1
-; KNL-NEXT:    korw %k1, %k5, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k6
+; KNL-NEXT:    movw $-33, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k3
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $5, %k1, %k1
-; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k7
+; KNL-NEXT:    movw $-65, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $6, %k1, %k1
-; KNL-NEXT:    korw %k1, %k7, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    movw $-129, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k4
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $7, %k1, %k1
-; KNL-NEXT:    kshiftlw $8, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    movw $-257, %ax ## imm = 0xFEFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $8, %k1, %k1
-; KNL-NEXT:    kshiftlw $9, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    movw $-513, %ax ## imm = 0xFDFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k5
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $9, %k1, %k1
-; KNL-NEXT:    kshiftlw $10, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $6, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $10, %k1, %k1
-; KNL-NEXT:    kshiftlw $11, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $11, %k1, %k1
-; KNL-NEXT:    kshiftlw $12, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $4, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    movw $-4097, %ax ## imm = 0xEFFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $12, %k1, %k1
-; KNL-NEXT:    kshiftlw $13, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $3, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    movw $-8193, %ax ## imm = 0xDFFF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $13, %k1, %k1
-; KNL-NEXT:    kshiftlw $14, %k0, %k2
-; KNL-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $2, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    movw $-16385, %ax ## imm = 0xBFFF
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kshiftlw $14, %k1, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k0, %k1, %k0
-; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $1, %k2, %k2
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $2, %k2, %k2
-; KNL-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k2, %k3, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k2, %k4, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k2, %k5, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    korw %k2, %k6, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $8, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $9, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $10, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $11, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $4, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $12, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $3, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $13, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $2, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $1, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $2, %k2, %k2
-; KNL-NEXT:    korw %k2, %k3, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $13, %k0, %k0
-; KNL-NEXT:    kshiftrw $13, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    korw %k2, %k4, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $12, %k0, %k0
-; KNL-NEXT:    kshiftrw $12, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    korw %k2, %k5, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $11, %k0, %k0
-; KNL-NEXT:    kshiftrw $11, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    korw %k2, %k6, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $10, %k0, %k0
-; KNL-NEXT:    kshiftrw $10, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $9, %k0, %k0
-; KNL-NEXT:    kshiftrw $9, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k1, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $8, %k0, %k0
-; KNL-NEXT:    kshiftrw $8, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $8, %k2, %k2
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k3, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k0
-; KNL-NEXT:    kshiftrw $7, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $9, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k4, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $10, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k5, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $5, %k0, %k0
-; KNL-NEXT:    kshiftrw $5, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $5, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $11, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $4, %k0, %k0
-; KNL-NEXT:    kshiftrw $4, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $4, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $12, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $3, %k0, %k0
-; KNL-NEXT:    kshiftrw $3, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $3, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $13, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kshiftlw $2, %k0, %k0
-; KNL-NEXT:    kshiftrw $2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $2, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k7, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k7
+; KNL-NEXT:    kshiftlw $14, %k7, %k7
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    kshiftlw $1, %k2, %k2
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; KNL-NEXT:    korw %k2, %k0, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $15, %k7, %k7
-; KNL-NEXT:    korw %k2, %k7, %k2
+; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $2, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; KNL-NEXT:    korw %k7, %k0, %k7
-; KNL-NEXT:    kshiftlw $14, %k2, %k2
-; KNL-NEXT:    kshiftrw $14, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k7, %k7
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k7, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $3, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; KNL-NEXT:    korw %k7, %k0, %k7
-; KNL-NEXT:    kshiftlw $13, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $13, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $4, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; KNL-NEXT:    korw %k7, %k0, %k7
-; KNL-NEXT:    kshiftlw $12, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $12, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k7
-; KNL-NEXT:    kshiftlw $11, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $11, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $6, %k7, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; KNL-NEXT:    korw %k7, %k0, %k7
-; KNL-NEXT:    kshiftlw $10, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $10, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k7
-; KNL-NEXT:    kshiftlw $9, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $9, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k3, %k7
-; KNL-NEXT:    kshiftlw $8, %k2, %k2
-; KNL-NEXT:    kshiftrw $8, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $8, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k4, %k7
-; KNL-NEXT:    kshiftlw $7, %k2, %k2
-; KNL-NEXT:    kshiftrw $7, %k2, %k2
-; KNL-NEXT:    korw %k7, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $7, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k3, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k5, %k6
-; KNL-NEXT:    kshiftlw $6, %k2, %k2
-; KNL-NEXT:    kshiftrw $6, %k2, %k2
-; KNL-NEXT:    korw %k6, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k7, %k7
+; KNL-NEXT:    kshiftrw $6, %k7, %k7
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k4, %k6, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k6
-; KNL-NEXT:    kshiftlw $11, %k6, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k6, %k1, %k5
-; KNL-NEXT:    kshiftlw $5, %k2, %k2
-; KNL-NEXT:    kshiftrw $5, %k2, %k2
-; KNL-NEXT:    korw %k5, %k2, %k2
+; KNL-NEXT:    kandw %k1, %k5, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
-; KNL-NEXT:    kshiftlw $12, %k5, %k5
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k5, %k1, %k4
-; KNL-NEXT:    kshiftlw $4, %k2, %k2
-; KNL-NEXT:    kshiftrw $4, %k2, %k2
-; KNL-NEXT:    korw %k4, %k2, %k2
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $4, %k5, %k5
+; KNL-NEXT:    korw %k5, %k4, %k4
+; KNL-NEXT:    kandw %k2, %k4, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
-; KNL-NEXT:    kshiftlw $13, %k4, %k4
+; KNL-NEXT:    kshiftlw $15, %k4, %k4
+; KNL-NEXT:    kshiftrw $3, %k4, %k4
+; KNL-NEXT:    korw %k4, %k3, %k3
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k4, %k1, %k3
-; KNL-NEXT:    kshiftlw $3, %k2, %k2
-; KNL-NEXT:    kshiftrw $3, %k2, %k2
-; KNL-NEXT:    korw %k3, %k2, %k2
+; KNL-NEXT:    kandw %k1, %k3, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
-; KNL-NEXT:    kshiftlw $14, %k3, %k3
+; KNL-NEXT:    kshiftlw $15, %k3, %k3
+; KNL-NEXT:    kshiftrw $2, %k3, %k3
+; KNL-NEXT:    korw %k3, %k2, %k2
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    korw %k3, %k1, %k1
-; KNL-NEXT:    kshiftlw $2, %k2, %k2
-; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kandw %k1, %k2, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k2
+; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
@@ -3311,7 +3248,6 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, 6(%rdi)
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; KNL-NEXT:    kmovw %k0, 4(%rdi)
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; KNL-NEXT:    kmovw %k0, 2(%rdi)
@@ -3337,476 +3273,434 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_64i1:
 ; AVX512DQ:       ## %bb.0:
+; AVX512DQ-NEXT:    movw $-3, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    kmovw %esi, %k0
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kmovw %edx, %k1
-; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k3
+; AVX512DQ-NEXT:    movw $-5, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k3, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k4
+; AVX512DQ-NEXT:    movw $-9, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %r8d, %k1
-; AVX512DQ-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k4, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k5
+; AVX512DQ-NEXT:    movw $-17, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kmovw %r9d, %k1
-; AVX512DQ-NEXT:    kshiftlw $4, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k5, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k6
+; AVX512DQ-NEXT:    movw $-33, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %k3
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $5, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k6, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k7
+; AVX512DQ-NEXT:    movw $-65, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k1
-; AVX512DQ-NEXT:    korw %k1, %k7, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    movw $-129, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    movw $-257, %ax ## imm = 0xFEFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $8, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQ-NEXT:    movw $-513, %ax ## imm = 0xFDFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %k5
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $9, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQ-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $10, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQ-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    movw $-4097, %ax ## imm = 0xEFFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $12, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQ-NEXT:    movw $-8193, %ax ## imm = 0xDFFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $13, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    movw $-16385, %ax ## imm = 0xBFFF
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kshiftlw $14, %k1, %k0
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
 ; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k0, %k1, %k0
-; AVX512DQ-NEXT:    korw %k0, %k2, %k0
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k0
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $14, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQ-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k2, %k3, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQ-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k2, %k4, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQ-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k2, %k5, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k6, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQ-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $5, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $4, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $3, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $2, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k0
-; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $14, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k3, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k4, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k5, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k6, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k1, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k3, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k4, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k5, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $5, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $4, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $3, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k0
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $2, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k7
+; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512DQ-NEXT:    korw %k2, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k2
-; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k2, %k0, %k2
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $15, %k7, %k7
-; AVX512DQ-NEXT:    korw %k2, %k7, %k2
+; AVX512DQ-NEXT:    korw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $2, %k7, %k7
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k7, %k0, %k7
-; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k7, %k7
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $14, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k7, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $3, %k7, %k7
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k7, %k0, %k7
-; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $13, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $4, %k7, %k7
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k7, %k0, %k7
-; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $5, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k6, %k7
-; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $6, %k7, %k7
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k7, %k0, %k7
-; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $7, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k1, %k7
-; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $8, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k3, %k7
-; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $9, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k4, %k7
-; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k2
-; AVX512DQ-NEXT:    korw %k7, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $7, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kandw %k3, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $10, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k5, %k6
-; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k2
-; AVX512DQ-NEXT:    korw %k6, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512DQ-NEXT:    kshiftrw $6, %k7, %k7
+; AVX512DQ-NEXT:    korw %k7, %k6, %k6
+; AVX512DQ-NEXT:    kandw %k4, %k6, %k5
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k6
-; AVX512DQ-NEXT:    kshiftlw $11, %k6, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k5, %k5
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k6, %k1, %k5
-; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k2
-; AVX512DQ-NEXT:    korw %k5, %k2, %k2
+; AVX512DQ-NEXT:    kandw %k1, %k5, %k4
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k5
-; AVX512DQ-NEXT:    kshiftlw $12, %k5, %k5
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k5, %k1, %k4
-; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k2
-; AVX512DQ-NEXT:    korw %k4, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512DQ-NEXT:    kshiftrw $4, %k5, %k5
+; AVX512DQ-NEXT:    korw %k5, %k4, %k4
+; AVX512DQ-NEXT:    kandw %k2, %k4, %k3
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
-; AVX512DQ-NEXT:    kshiftlw $13, %k4, %k4
+; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512DQ-NEXT:    korw %k4, %k3, %k3
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k4, %k1, %k3
-; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQ-NEXT:    korw %k3, %k2, %k2
+; AVX512DQ-NEXT:    kandw %k1, %k3, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
-; AVX512DQ-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k3
+; AVX512DQ-NEXT:    korw %k3, %k2, %k2
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    korw %k3, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k2
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQ-NEXT:    korw %k1, %k2, %k1
+; AVX512DQ-NEXT:    kandw %k1, %k2, %k1
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k2
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512DQ-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k1
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
@@ -3814,7 +3708,6 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512DQ-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQ-NEXT:    kmovw %k1, 6(%rdi)
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kmovw %k0, 4(%rdi)
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kmovw %k0, 2(%rdi)

diff  --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 05d35eb7efa9..eb1fba14a200 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -4913,26 +4913,22 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
 ; AVX512F-LABEL: widen_masked_store:
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    movw $-3, %ax
+; AVX512F-NEXT:    kmovw %eax, %k0
 ; AVX512F-NEXT:    andl $1, %esi
-; AVX512F-NEXT:    kmovw %esi, %k0
-; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512F-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512F-NEXT:    kmovw %edx, %k2
-; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512F-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512F-NEXT:    korw %k2, %k1, %k1
-; AVX512F-NEXT:    korw %k1, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-NEXT:    kmovw %esi, %k1
+; AVX512F-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512F-NEXT:    korw %k1, %k0, %k0
+; AVX512F-NEXT:    movw $-5, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kandw %k1, %k0, %k0
 ; AVX512F-NEXT:    kmovw %ecx, %k1
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-NEXT:    korw %k0, %k1, %k0
+; AVX512F-NEXT:    korw %k1, %k0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
@@ -4941,52 +4937,44 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
 ;
 ; AVX512VLDQ-LABEL: widen_masked_store:
 ; AVX512VLDQ:       ## %bb.0:
-; AVX512VLDQ-NEXT:    kmovw %esi, %k0
-; AVX512VLDQ-NEXT:    kshiftlb $7, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
-; AVX512VLDQ-NEXT:    kshiftlb $2, %k1, %k1
-; AVX512VLDQ-NEXT:    kshiftlb $7, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
-; AVX512VLDQ-NEXT:    kmovw %edx, %k2
-; AVX512VLDQ-NEXT:    kshiftlb $7, %k2, %k2
-; AVX512VLDQ-NEXT:    kshiftrb $6, %k2, %k2
-; AVX512VLDQ-NEXT:    korb %k2, %k1, %k1
-; AVX512VLDQ-NEXT:    korb %k1, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k1
-; AVX512VLDQ-NEXT:    kshiftlb $3, %k1, %k1
-; AVX512VLDQ-NEXT:    kshiftlb $6, %k0, %k0
-; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512VLDQ-NEXT:    movb $-3, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k0
+; AVX512VLDQ-NEXT:    kmovw %esi, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $7, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k1, %k1
+; AVX512VLDQ-NEXT:    kandw %k0, %k1, %k0
+; AVX512VLDQ-NEXT:    kmovw %edx, %k1
+; AVX512VLDQ-NEXT:    kshiftlb $7, %k1, %k1
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k1, %k1
 ; AVX512VLDQ-NEXT:    korw %k1, %k0, %k0
+; AVX512VLDQ-NEXT:    movb $-5, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512VLDQ-NEXT:    kmovw %ecx, %k1
 ; AVX512VLDQ-NEXT:    kshiftlb $7, %k1, %k1
 ; AVX512VLDQ-NEXT:    kshiftrb $5, %k1, %k1
-; AVX512VLDQ-NEXT:    korw %k0, %k1, %k1
+; AVX512VLDQ-NEXT:    korw %k1, %k0, %k1
 ; AVX512VLDQ-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: widen_masked_store:
 ; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movw $-3, %ax
+; AVX512VLBW-NEXT:    kmovd %eax, %k0
 ; AVX512VLBW-NEXT:    andl $1, %esi
-; AVX512VLBW-NEXT:    kmovw %esi, %k0
-; AVX512VLBW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512VLBW-NEXT:    kshiftlw $2, %k1, %k1
-; AVX512VLBW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512VLBW-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512VLBW-NEXT:    kmovd %edx, %k2
-; AVX512VLBW-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512VLBW-NEXT:    korw %k2, %k1, %k1
-; AVX512VLBW-NEXT:    korw %k1, %k0, %k0
-; AVX512VLBW-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512VLBW-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512VLBW-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512VLBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512VLBW-NEXT:    kmovw %esi, %k1
+; AVX512VLBW-NEXT:    kandw %k0, %k1, %k0
+; AVX512VLBW-NEXT:    kmovd %edx, %k1
+; AVX512VLBW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512VLBW-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512VLBW-NEXT:    korw %k1, %k0, %k0
+; AVX512VLBW-NEXT:    movw $-5, %ax
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512VLBW-NEXT:    kmovd %ecx, %k1
 ; AVX512VLBW-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512VLBW-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512VLBW-NEXT:    korw %k0, %k1, %k1
+; AVX512VLBW-NEXT:    korw %k1, %k0, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT:    retq
   call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)

diff  --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 3273efd422c8..543da1e999c6 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1130,439 +1130,438 @@ define void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k0
-; CHECK-NEXT:    kshiftrd $3, %k0, %k1
-; CHECK-NEXT:    kshiftlq $2, %k0, %k2
-; CHECK-NEXT:    kshiftlq $1, %k0, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $1, %k0, %k3
-; CHECK-NEXT:    kshiftlq $63, %k3, %k3
-; CHECK-NEXT:    kshiftrq $63, %k3, %k3
-; CHECK-NEXT:    korq %k2, %k3, %k2
-; CHECK-NEXT:    kshiftlq $3, %k0, %k3
-; CHECK-NEXT:    kshiftlq $2, %k1, %k1
-; CHECK-NEXT:    korq %k1, %k3, %k1
-; CHECK-NEXT:    kshiftrd $2, %k0, %k3
-; CHECK-NEXT:    kshiftlq $62, %k2, %k2
+; CHECK-NEXT:    kshiftrd $1, %k0, %k1
+; CHECK-NEXT:    movq $-3, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $63, %k0, %k2
 ; CHECK-NEXT:    kshiftrq $62, %k2, %k2
-; CHECK-NEXT:    korq %k1, %k2, %k1
-; CHECK-NEXT:    kshiftlq $4, %k0, %k2
-; CHECK-NEXT:    kshiftlq $3, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $5, %k0, %k3
-; CHECK-NEXT:    kshiftlq $61, %k1, %k1
-; CHECK-NEXT:    kshiftrq $61, %k1, %k1
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $5, %k0, %k2
-; CHECK-NEXT:    kshiftlq $4, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $4, %k0, %k3
-; CHECK-NEXT:    kshiftlq $60, %k1, %k1
-; CHECK-NEXT:    kshiftrq $60, %k1, %k1
+; CHECK-NEXT:    movq $-5, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $3, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $61, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $6, %k0, %k2
-; CHECK-NEXT:    kshiftlq $5, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $7, %k0, %k3
-; CHECK-NEXT:    kshiftlq $59, %k1, %k1
-; CHECK-NEXT:    kshiftrq $59, %k1, %k1
+; CHECK-NEXT:    movq $-9, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $2, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $60, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $7, %k0, %k2
-; CHECK-NEXT:    kshiftlq $6, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $6, %k0, %k3
-; CHECK-NEXT:    kshiftlq $58, %k1, %k1
-; CHECK-NEXT:    kshiftrq $58, %k1, %k1
+; CHECK-NEXT:    movq $-17, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $5, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $59, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $8, %k0, %k2
-; CHECK-NEXT:    kshiftlq $7, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $9, %k0, %k3
-; CHECK-NEXT:    kshiftlq $57, %k1, %k1
-; CHECK-NEXT:    kshiftrq $57, %k1, %k1
+; CHECK-NEXT:    movq $-33, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $4, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $58, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $9, %k0, %k2
-; CHECK-NEXT:    kshiftlq $8, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $8, %k0, %k3
-; CHECK-NEXT:    kshiftlq $56, %k1, %k1
-; CHECK-NEXT:    kshiftrq $56, %k1, %k1
+; CHECK-NEXT:    movq $-65, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $7, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $57, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $10, %k0, %k2
-; CHECK-NEXT:    kshiftlq $9, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $11, %k0, %k3
-; CHECK-NEXT:    kshiftlq $55, %k1, %k1
-; CHECK-NEXT:    kshiftrq $55, %k1, %k1
+; CHECK-NEXT:    movq $-129, %rax
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $6, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $56, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $11, %k0, %k2
-; CHECK-NEXT:    kshiftlq $10, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $10, %k0, %k3
-; CHECK-NEXT:    kshiftlq $54, %k1, %k1
-; CHECK-NEXT:    kshiftrq $54, %k1, %k1
+; CHECK-NEXT:    movq $-257, %rax # imm = 0xFEFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $9, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $55, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $12, %k0, %k2
-; CHECK-NEXT:    kshiftlq $11, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $13, %k0, %k3
-; CHECK-NEXT:    kshiftlq $53, %k1, %k1
-; CHECK-NEXT:    kshiftrq $53, %k1, %k1
+; CHECK-NEXT:    movq $-513, %rax # imm = 0xFDFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $8, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $54, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $13, %k0, %k2
-; CHECK-NEXT:    kshiftlq $12, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $12, %k0, %k3
-; CHECK-NEXT:    kshiftlq $52, %k1, %k1
-; CHECK-NEXT:    kshiftrq $52, %k1, %k1
+; CHECK-NEXT:    movq $-1025, %rax # imm = 0xFBFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $11, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $53, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $14, %k0, %k2
-; CHECK-NEXT:    kshiftlq $13, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $15, %k0, %k3
-; CHECK-NEXT:    kshiftlq $51, %k1, %k1
-; CHECK-NEXT:    kshiftrq $51, %k1, %k1
+; CHECK-NEXT:    movq $-2049, %rax # imm = 0xF7FF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $10, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $52, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $15, %k0, %k2
-; CHECK-NEXT:    kshiftlq $14, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $14, %k0, %k3
-; CHECK-NEXT:    kshiftlq $50, %k1, %k1
-; CHECK-NEXT:    kshiftrq $50, %k1, %k1
+; CHECK-NEXT:    movq $-4097, %rax # imm = 0xEFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $13, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $51, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $16, %k0, %k2
-; CHECK-NEXT:    kshiftlq $15, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $17, %k0, %k3
-; CHECK-NEXT:    kshiftlq $49, %k1, %k1
-; CHECK-NEXT:    kshiftrq $49, %k1, %k1
+; CHECK-NEXT:    movq $-8193, %rax # imm = 0xDFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $12, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $50, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $17, %k0, %k2
-; CHECK-NEXT:    kshiftlq $16, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $16, %k0, %k3
-; CHECK-NEXT:    kshiftlq $48, %k1, %k1
-; CHECK-NEXT:    kshiftrq $48, %k1, %k1
+; CHECK-NEXT:    movq $-16385, %rax # imm = 0xBFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $15, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $49, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $18, %k0, %k2
-; CHECK-NEXT:    kshiftlq $17, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $19, %k0, %k3
-; CHECK-NEXT:    kshiftlq $47, %k1, %k1
-; CHECK-NEXT:    kshiftrq $47, %k1, %k1
+; CHECK-NEXT:    movq $-32769, %rax # imm = 0xFFFF7FFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $14, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $48, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $19, %k0, %k2
-; CHECK-NEXT:    kshiftlq $18, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $18, %k0, %k3
-; CHECK-NEXT:    kshiftlq $46, %k1, %k1
-; CHECK-NEXT:    kshiftrq $46, %k1, %k1
+; CHECK-NEXT:    movq $-65537, %rax # imm = 0xFFFEFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $17, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $47, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $20, %k0, %k2
-; CHECK-NEXT:    kshiftlq $19, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $21, %k0, %k3
-; CHECK-NEXT:    kshiftlq $45, %k1, %k1
-; CHECK-NEXT:    kshiftrq $45, %k1, %k1
+; CHECK-NEXT:    movq $-131073, %rax # imm = 0xFFFDFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $16, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $46, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $21, %k0, %k2
-; CHECK-NEXT:    kshiftlq $20, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $20, %k0, %k3
-; CHECK-NEXT:    kshiftlq $44, %k1, %k1
-; CHECK-NEXT:    kshiftrq $44, %k1, %k1
+; CHECK-NEXT:    movq $-262145, %rax # imm = 0xFFFBFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $19, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $45, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $22, %k0, %k2
-; CHECK-NEXT:    kshiftlq $21, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $23, %k0, %k3
-; CHECK-NEXT:    kshiftlq $43, %k1, %k1
-; CHECK-NEXT:    kshiftrq $43, %k1, %k1
+; CHECK-NEXT:    movq $-524289, %rax # imm = 0xFFF7FFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $18, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $44, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $23, %k0, %k2
-; CHECK-NEXT:    kshiftlq $22, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $22, %k0, %k3
-; CHECK-NEXT:    kshiftlq $42, %k1, %k1
-; CHECK-NEXT:    kshiftrq $42, %k1, %k1
+; CHECK-NEXT:    movq $-1048577, %rax # imm = 0xFFEFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $21, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $43, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $24, %k0, %k2
-; CHECK-NEXT:    kshiftlq $23, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $25, %k0, %k3
-; CHECK-NEXT:    kshiftlq $41, %k1, %k1
-; CHECK-NEXT:    kshiftrq $41, %k1, %k1
+; CHECK-NEXT:    movq $-2097153, %rax # imm = 0xFFDFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $20, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $42, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $25, %k0, %k2
-; CHECK-NEXT:    kshiftlq $24, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $24, %k0, %k3
-; CHECK-NEXT:    kshiftlq $40, %k1, %k1
-; CHECK-NEXT:    kshiftrq $40, %k1, %k1
+; CHECK-NEXT:    movq $-4194305, %rax # imm = 0xFFBFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $23, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $41, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $26, %k0, %k2
-; CHECK-NEXT:    kshiftlq $25, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $27, %k0, %k3
-; CHECK-NEXT:    kshiftlq $39, %k1, %k1
-; CHECK-NEXT:    kshiftrq $39, %k1, %k1
+; CHECK-NEXT:    movq $-8388609, %rax # imm = 0xFF7FFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $22, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $40, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $27, %k0, %k2
-; CHECK-NEXT:    kshiftlq $26, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $26, %k0, %k3
-; CHECK-NEXT:    kshiftlq $38, %k1, %k1
-; CHECK-NEXT:    kshiftrq $38, %k1, %k1
+; CHECK-NEXT:    movq $-16777217, %rax # imm = 0xFEFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $25, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $39, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $28, %k0, %k2
-; CHECK-NEXT:    kshiftlq $27, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $29, %k0, %k3
-; CHECK-NEXT:    kshiftlq $37, %k1, %k1
-; CHECK-NEXT:    kshiftrq $37, %k1, %k1
+; CHECK-NEXT:    movq $-33554433, %rax # imm = 0xFDFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $24, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $38, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $29, %k0, %k2
-; CHECK-NEXT:    kshiftlq $28, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $28, %k0, %k3
-; CHECK-NEXT:    kshiftlq $36, %k1, %k1
-; CHECK-NEXT:    kshiftrq $36, %k1, %k1
+; CHECK-NEXT:    movq $-67108865, %rax # imm = 0xFBFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $27, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $37, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k1, %k1
-; CHECK-NEXT:    kshiftlq $30, %k0, %k2
-; CHECK-NEXT:    kshiftlq $29, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $31, %k0, %k3
-; CHECK-NEXT:    kshiftlq $35, %k1, %k1
-; CHECK-NEXT:    kshiftrq $35, %k1, %k1
-; CHECK-NEXT:    korq %k2, %k1, %k2
-; CHECK-NEXT:    kshiftlq $31, %k0, %k1
-; CHECK-NEXT:    kshiftlq $30, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k1, %k3
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
-; CHECK-NEXT:    kshiftrd $30, %k0, %k0
-; CHECK-NEXT:    kshiftlq $34, %k2, %k2
+; CHECK-NEXT:    movq $-134217729, %rax # imm = 0xF7FFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $26, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $36, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    movq $-268435457, %rax # imm = 0xEFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $29, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $35, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    movq $-536870913, %rax # imm = 0xDFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $28, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
 ; CHECK-NEXT:    kshiftrq $34, %k2, %k2
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftlq $32, %k0, %k3
-; CHECK-NEXT:    kshiftlq $31, %k0, %k0
-; CHECK-NEXT:    korq %k0, %k3, %k0
-; CHECK-NEXT:    kshiftrd $1, %k1, %k3
-; CHECK-NEXT:    kshiftlq $33, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    movq $-1073741825, %rax # imm = 0xBFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftrd $31, %k0, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
 ; CHECK-NEXT:    kshiftrq $33, %k2, %k2
-; CHECK-NEXT:    korq %k0, %k2, %k0
-; CHECK-NEXT:    kshiftlq $32, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k1, %k2
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
+; CHECK-NEXT:    kshiftrd $30, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k0, %k0
 ; CHECK-NEXT:    kshiftrq $32, %k0, %k0
-; CHECK-NEXT:    kshiftlq $33, %k0, %k2
-; CHECK-NEXT:    kshiftlq $32, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    korq %k0, %k2, %k0
+; CHECK-NEXT:    movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $1, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $31, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $34, %k0, %k2
-; CHECK-NEXT:    kshiftlq $33, %k1, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $3, %k1, %k3
-; CHECK-NEXT:    kshiftlq $31, %k0, %k0
-; CHECK-NEXT:    kshiftrq $31, %k0, %k0
+; CHECK-NEXT:    movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k1, %k2
+; CHECK-NEXT:    kshiftrq $30, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $35, %k0, %k2
-; CHECK-NEXT:    kshiftlq $34, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $2, %k1, %k3
-; CHECK-NEXT:    kshiftlq $30, %k0, %k0
-; CHECK-NEXT:    kshiftrq $30, %k0, %k0
+; CHECK-NEXT:    movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $3, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $29, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $36, %k0, %k2
-; CHECK-NEXT:    kshiftlq $35, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $5, %k1, %k3
-; CHECK-NEXT:    kshiftlq $29, %k0, %k0
-; CHECK-NEXT:    kshiftrq $29, %k0, %k0
+; CHECK-NEXT:    movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $2, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $28, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $37, %k0, %k2
-; CHECK-NEXT:    kshiftlq $36, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $4, %k1, %k3
-; CHECK-NEXT:    kshiftlq $28, %k0, %k0
-; CHECK-NEXT:    kshiftrq $28, %k0, %k0
+; CHECK-NEXT:    movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $5, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $27, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $38, %k0, %k2
-; CHECK-NEXT:    kshiftlq $37, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $7, %k1, %k3
-; CHECK-NEXT:    kshiftlq $27, %k0, %k0
-; CHECK-NEXT:    kshiftrq $27, %k0, %k0
+; CHECK-NEXT:    movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $4, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $26, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $39, %k0, %k2
-; CHECK-NEXT:    kshiftlq $38, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $6, %k1, %k3
-; CHECK-NEXT:    kshiftlq $26, %k0, %k0
-; CHECK-NEXT:    kshiftrq $26, %k0, %k0
+; CHECK-NEXT:    movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $7, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $25, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $40, %k0, %k2
-; CHECK-NEXT:    kshiftlq $39, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $9, %k1, %k3
-; CHECK-NEXT:    kshiftlq $25, %k0, %k0
-; CHECK-NEXT:    kshiftrq $25, %k0, %k0
+; CHECK-NEXT:    movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $6, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $24, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $41, %k0, %k2
-; CHECK-NEXT:    kshiftlq $40, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $8, %k1, %k3
-; CHECK-NEXT:    kshiftlq $24, %k0, %k0
-; CHECK-NEXT:    kshiftrq $24, %k0, %k0
+; CHECK-NEXT:    movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $9, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $23, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $42, %k0, %k2
-; CHECK-NEXT:    kshiftlq $41, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $11, %k1, %k3
-; CHECK-NEXT:    kshiftlq $23, %k0, %k0
-; CHECK-NEXT:    kshiftrq $23, %k0, %k0
+; CHECK-NEXT:    movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $8, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $22, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $43, %k0, %k2
-; CHECK-NEXT:    kshiftlq $42, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $10, %k1, %k3
-; CHECK-NEXT:    kshiftlq $22, %k0, %k0
-; CHECK-NEXT:    kshiftrq $22, %k0, %k0
+; CHECK-NEXT:    movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $11, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $21, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $44, %k0, %k2
-; CHECK-NEXT:    kshiftlq $43, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $13, %k1, %k3
-; CHECK-NEXT:    kshiftlq $21, %k0, %k0
-; CHECK-NEXT:    kshiftrq $21, %k0, %k0
+; CHECK-NEXT:    movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $10, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $20, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $45, %k0, %k2
-; CHECK-NEXT:    kshiftlq $44, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $12, %k1, %k3
-; CHECK-NEXT:    kshiftlq $20, %k0, %k0
-; CHECK-NEXT:    kshiftrq $20, %k0, %k0
+; CHECK-NEXT:    movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $13, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $19, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $46, %k0, %k2
-; CHECK-NEXT:    kshiftlq $45, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $15, %k1, %k3
-; CHECK-NEXT:    kshiftlq $19, %k0, %k0
-; CHECK-NEXT:    kshiftrq $19, %k0, %k0
+; CHECK-NEXT:    movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $12, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $18, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $47, %k0, %k2
-; CHECK-NEXT:    kshiftlq $46, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $14, %k1, %k3
-; CHECK-NEXT:    kshiftlq $18, %k0, %k0
-; CHECK-NEXT:    kshiftrq $18, %k0, %k0
+; CHECK-NEXT:    movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $15, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $17, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $48, %k0, %k2
-; CHECK-NEXT:    kshiftlq $47, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $17, %k1, %k3
-; CHECK-NEXT:    kshiftlq $17, %k0, %k0
-; CHECK-NEXT:    kshiftrq $17, %k0, %k0
+; CHECK-NEXT:    movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $14, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $16, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $49, %k0, %k2
-; CHECK-NEXT:    kshiftlq $48, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $16, %k1, %k3
-; CHECK-NEXT:    kshiftlq $16, %k0, %k0
-; CHECK-NEXT:    kshiftrq $16, %k0, %k0
+; CHECK-NEXT:    movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $17, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $15, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $50, %k0, %k2
-; CHECK-NEXT:    kshiftlq $49, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $19, %k1, %k3
-; CHECK-NEXT:    kshiftlq $15, %k0, %k0
-; CHECK-NEXT:    kshiftrq $15, %k0, %k0
+; CHECK-NEXT:    movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $16, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $14, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $51, %k0, %k2
-; CHECK-NEXT:    kshiftlq $50, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $18, %k1, %k3
-; CHECK-NEXT:    kshiftlq $14, %k0, %k0
-; CHECK-NEXT:    kshiftrq $14, %k0, %k0
+; CHECK-NEXT:    movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $19, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $13, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $52, %k0, %k2
-; CHECK-NEXT:    kshiftlq $51, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $21, %k1, %k3
-; CHECK-NEXT:    kshiftlq $13, %k0, %k0
-; CHECK-NEXT:    kshiftrq $13, %k0, %k0
+; CHECK-NEXT:    movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $18, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $12, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $53, %k0, %k2
-; CHECK-NEXT:    kshiftlq $52, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $20, %k1, %k3
-; CHECK-NEXT:    kshiftlq $12, %k0, %k0
-; CHECK-NEXT:    kshiftrq $12, %k0, %k0
+; CHECK-NEXT:    movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $21, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $11, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $54, %k0, %k2
-; CHECK-NEXT:    kshiftlq $53, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $23, %k1, %k3
-; CHECK-NEXT:    kshiftlq $11, %k0, %k0
-; CHECK-NEXT:    kshiftrq $11, %k0, %k0
+; CHECK-NEXT:    movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $20, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $10, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $55, %k0, %k2
-; CHECK-NEXT:    kshiftlq $54, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $22, %k1, %k3
-; CHECK-NEXT:    kshiftlq $10, %k0, %k0
-; CHECK-NEXT:    kshiftrq $10, %k0, %k0
+; CHECK-NEXT:    movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $23, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $9, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $56, %k0, %k2
-; CHECK-NEXT:    kshiftlq $55, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $25, %k1, %k3
-; CHECK-NEXT:    kshiftlq $9, %k0, %k0
-; CHECK-NEXT:    kshiftrq $9, %k0, %k0
+; CHECK-NEXT:    movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $22, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $8, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $57, %k0, %k2
-; CHECK-NEXT:    kshiftlq $56, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $24, %k1, %k3
-; CHECK-NEXT:    kshiftlq $8, %k0, %k0
-; CHECK-NEXT:    kshiftrq $8, %k0, %k0
+; CHECK-NEXT:    movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $25, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $7, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $58, %k0, %k2
-; CHECK-NEXT:    kshiftlq $57, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $27, %k1, %k3
-; CHECK-NEXT:    kshiftlq $7, %k0, %k0
-; CHECK-NEXT:    kshiftrq $7, %k0, %k0
+; CHECK-NEXT:    movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $24, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $6, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $59, %k0, %k2
-; CHECK-NEXT:    kshiftlq $58, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $26, %k1, %k3
-; CHECK-NEXT:    kshiftlq $6, %k0, %k0
-; CHECK-NEXT:    kshiftrq $6, %k0, %k0
+; CHECK-NEXT:    movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $27, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $5, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $60, %k0, %k2
-; CHECK-NEXT:    kshiftlq $59, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $29, %k1, %k3
-; CHECK-NEXT:    kshiftlq $5, %k0, %k0
-; CHECK-NEXT:    kshiftrq $5, %k0, %k0
+; CHECK-NEXT:    movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $26, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $4, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $61, %k0, %k2
-; CHECK-NEXT:    kshiftlq $60, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $28, %k1, %k3
-; CHECK-NEXT:    kshiftlq $4, %k0, %k0
-; CHECK-NEXT:    kshiftrq $4, %k0, %k0
+; CHECK-NEXT:    movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $29, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $3, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $62, %k0, %k2
-; CHECK-NEXT:    kshiftlq $61, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftrd $31, %k1, %k3
-; CHECK-NEXT:    kshiftlq $3, %k0, %k0
-; CHECK-NEXT:    kshiftrq $3, %k0, %k0
+; CHECK-NEXT:    movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $28, %k1, %k2
+; CHECK-NEXT:    kshiftlq $63, %k2, %k2
+; CHECK-NEXT:    kshiftrq $2, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
-; CHECK-NEXT:    kshiftlq $63, %k0, %k2
-; CHECK-NEXT:    kshiftlq $62, %k3, %k3
-; CHECK-NEXT:    korq %k3, %k2, %k2
-; CHECK-NEXT:    kshiftlq $2, %k0, %k0
-; CHECK-NEXT:    kshiftrq $2, %k0, %k0
+; CHECK-NEXT:    movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
+; CHECK-NEXT:    kmovq %rax, %k2
+; CHECK-NEXT:    kandq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $31, %k1, %k2
+; CHECK-NEXT:    kshiftlq $62, %k2, %k2
 ; CHECK-NEXT:    korq %k2, %k0, %k0
 ; CHECK-NEXT:    kshiftrd $30, %k1, %k1
 ; CHECK-NEXT:    kshiftlq $1, %k0, %k0

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 2cd2ebc43e38..a7412e4b6bef 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1730,25 +1730,24 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; AVX512-LABEL: smulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vmovq %xmm1, %rdx
-; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
 ; AVX512-NEXT:    imulq %rdx, %rsi
 ; AVX512-NEXT:    seto %dl
+; AVX512-NEXT:    vmovq %rsi, %xmm0
 ; AVX512-NEXT:    imulq %rax, %rcx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    vmovq %rsi, %xmm1
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    movw $-3, %cx
+; AVX512-NEXT:    kmovd %ecx, %k0
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kandw %k0, %k1, %k0
 ; AVX512-NEXT:    kmovd %edx, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512-NEXT:    kshiftlw $2, %k0, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512-NEXT:    korw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -2249,10 +2248,10 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    movw $-3, %ax
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512-NEXT:    kandw %k0, %k1, %k1
 ; AVX512-NEXT:    movl %edx, %eax
 ; AVX512-NEXT:    imulb %sil
 ; AVX512-NEXT:    movl %eax, %edx
@@ -2265,12 +2264,12 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k0, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $14, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kshiftlw $3, %k0, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k2
+; AVX512-NEXT:    movw $-5, %ax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kandw %k1, %k2, %k2
 ; AVX512-NEXT:    movl %r11d, %eax
 ; AVX512-NEXT:    imulb %bl
 ; AVX512-NEXT:    movl %eax, %esi
@@ -2285,9 +2284,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kmovd %eax, %k3
 ; AVX512-NEXT:    kshiftlw $2, %k3, %k3
 ; AVX512-NEXT:    korw %k3, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $13, %k1, %k1
-; AVX512-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512-NEXT:    kshiftrw $13, %k2, %k2
 ; AVX512-NEXT:    movl %r10d, %eax
 ; AVX512-NEXT:    imulb %r9b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
@@ -2299,37 +2297,29 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    setne %bl
 ; AVX512-NEXT:    orb %cl, %bl
 ; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    kmovd %ecx, %k2
-; AVX512-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kmovd %ecx, %k3
+; AVX512-NEXT:    kshiftlw $3, %k3, %k3
+; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kmovd %r8d, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT:    kmovd %r8d, %k2
+; AVX512-NEXT:    kandw %k0, %k2, %k0
 ; AVX512-NEXT:    kmovd %edx, %k2
 ; AVX512-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $14, %k2, %k2
 ; AVX512-NEXT:    korw %k2, %k0, %k0
-; AVX512-NEXT:    korw %k0, %k1, %k0
-; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512-NEXT:    kshiftlw $4, %k1, %k1
-; AVX512-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    kandw %k1, %k0, %k0
+; AVX512-NEXT:    kmovd %esi, %k1
+; AVX512-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    movw $-9, %cx
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512-NEXT:    korw %k0, %k1, %k0
+; AVX512-NEXT:    korw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    popq %rbx

diff  --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index cda9ee9ed172..e067bd7e048f 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1528,27 +1528,25 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; AVX512-LABEL: umulo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
 ; AVX512-NEXT:    mulq %rdx
-; AVX512-NEXT:    movq %rax, %rsi
-; AVX512-NEXT:    seto %r9b
-; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    mulq %r8
+; AVX512-NEXT:    seto %r8b
 ; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vmovq %rsi, %xmm1
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    mulq %rsi
+; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kmovd %r9d, %k1
+; AVX512-NEXT:    movw $-3, %cx
+; AVX512-NEXT:    kmovd %ecx, %k0
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kandw %k0, %k1, %k0
+; AVX512-NEXT:    kmovd %r8d, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512-NEXT:    kshiftlw $2, %k0, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512-NEXT:    korw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -1983,10 +1981,10 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    movw $-3, %ax
 ; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512-NEXT:    kshiftlw $2, %k0, %k0
+; AVX512-NEXT:    kandw %k0, %k1, %k1
 ; AVX512-NEXT:    movl %edx, %eax
 ; AVX512-NEXT:    mulb %sil
 ; AVX512-NEXT:    movl %eax, %edx
@@ -1996,12 +1994,12 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k2
-; AVX512-NEXT:    kshiftlw $1, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k0, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $14, %k1, %k1
-; AVX512-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512-NEXT:    kshiftlw $3, %k0, %k2
+; AVX512-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512-NEXT:    korw %k2, %k1, %k2
+; AVX512-NEXT:    movw $-5, %ax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kandw %k1, %k2, %k2
 ; AVX512-NEXT:    movl %r11d, %eax
 ; AVX512-NEXT:    mulb %bl
 ; AVX512-NEXT:    movl %eax, %esi
@@ -2013,9 +2011,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kmovd %eax, %k3
 ; AVX512-NEXT:    kshiftlw $2, %k3, %k3
 ; AVX512-NEXT:    korw %k3, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    kshiftlw $13, %k1, %k1
-; AVX512-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512-NEXT:    kshiftlw $13, %k2, %k2
+; AVX512-NEXT:    kshiftrw $13, %k2, %k2
 ; AVX512-NEXT:    movl %r9d, %eax
 ; AVX512-NEXT:    mulb %r10b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
@@ -2024,37 +2021,29 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    setne %bl
 ; AVX512-NEXT:    orb %cl, %bl
 ; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    kmovd %ecx, %k2
-; AVX512-NEXT:    kshiftlw $3, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
+; AVX512-NEXT:    kmovd %ecx, %k3
+; AVX512-NEXT:    kshiftlw $3, %k3, %k3
+; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kmovd %r8d, %k1
-; AVX512-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT:    kmovd %r8d, %k2
+; AVX512-NEXT:    kandw %k0, %k2, %k0
 ; AVX512-NEXT:    kmovd %edx, %k2
 ; AVX512-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $14, %k2, %k2
 ; AVX512-NEXT:    korw %k2, %k0, %k0
-; AVX512-NEXT:    korw %k0, %k1, %k0
-; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512-NEXT:    kmovd %esi, %k2
-; AVX512-NEXT:    kshiftlw $15, %k2, %k2
-; AVX512-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512-NEXT:    korw %k2, %k1, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k0
-; AVX512-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512-NEXT:    kshiftlw $4, %k1, %k1
-; AVX512-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    kandw %k1, %k0, %k0
+; AVX512-NEXT:    kmovd %esi, %k1
+; AVX512-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512-NEXT:    kshiftrw $13, %k1, %k1
 ; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    movw $-9, %cx
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512-NEXT:    kshiftrw $12, %k1, %k1
-; AVX512-NEXT:    korw %k0, %k1, %k0
+; AVX512-NEXT:    korw %k1, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    popq %rbx


        


More information about the llvm-commits mailing list