[llvm] b530461 - [DAG] SimplifyDemandedBits - fold AND(INSERT_SUBVECTOR(C,X,I),M) -> INSERT_SUBVECTOR(AND(C,M),X,I)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 8 08:08:46 PDT 2022


Author: Simon Pilgrim
Date: 2022-07-08T16:08:31+01:00
New Revision: b53046122fc5b4f087bfeee84ee6089b66fa7390

URL: https://github.com/llvm/llvm-project/commit/b53046122fc5b4f087bfeee84ee6089b66fa7390
DIFF: https://github.com/llvm/llvm-project/commit/b53046122fc5b4f087bfeee84ee6089b66fa7390.diff

LOG: [DAG] SimplifyDemandedBits - fold AND(INSERT_SUBVECTOR(C,X,I),M) -> INSERT_SUBVECTOR(AND(C,M),X,I)

If all the demanded bits of the AND mask covering the inserted subvector 'X' are known to be one, then the mask isn't affecting the subvector at all.

In that case, if the base vector 'C' is undef/constant, we can move the AND mask up onto 'C', where it will simply be (constant) folded directly.

Addresses some of the regressions from D129150, particularly the cases where we're attempting to zero the upper elements of a widened vector.

Differential Revision: https://reviews.llvm.org/D129290

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/X86/avx512-calling-conv.ll
    llvm/test/CodeGen/X86/avx512-ext.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll
    llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a6b471ea22b7..6e2eee6a319b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1362,6 +1362,29 @@ bool TargetLowering::SimplifyDemandedBits(
       }
     }
 
+    // AND(INSERT_SUBVECTOR(C,X,I),M) -> INSERT_SUBVECTOR(AND(C,M),X,I)
+    // iff 'C' is Undef/Constant and AND(X,M) == X (for DemandedBits).
+    if (Op0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        (Op0.getOperand(0).isUndef() ||
+         ISD::isBuildVectorOfConstantSDNodes(Op0.getOperand(0).getNode())) &&
+        Op0->hasOneUse()) {
+      unsigned NumSubElts =
+          Op0.getOperand(1).getValueType().getVectorNumElements();
+      unsigned SubIdx = Op0.getConstantOperandVal(2);
+      APInt DemandedSub =
+          APInt::getBitsSet(NumElts, SubIdx, SubIdx + NumSubElts);
+      KnownBits KnownSubMask =
+          TLO.DAG.computeKnownBits(Op1, DemandedSub & DemandedElts, Depth + 1);
+      if (DemandedBits.isSubsetOf(KnownSubMask.One)) {
+        SDValue NewAnd =
+            TLO.DAG.getNode(ISD::AND, dl, VT, Op0.getOperand(0), Op1);
+        SDValue NewInsert =
+            TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, NewAnd,
+                            Op0.getOperand(1), Op0.getOperand(2));
+        return TLO.CombineTo(Op, NewInsert);
+      }
+    }
+
     if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
                              Depth + 1))
       return true;

diff  --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 9f797d07134b..055726f0c324 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -679,13 +679,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    pushq %r13
 ; KNL-NEXT:    pushq %r12
 ; KNL-NEXT:    pushq %rbx
+; KNL-NEXT:    xorl %r10d, %r10d
+; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    movl $65535, %eax ## imm = 0xFFFF
+; KNL-NEXT:    movl $0, %r11d
+; KNL-NEXT:    cmovnel %eax, %r11d
+; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    cmovnel %eax, %r10d
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movw $-3, %di
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -755,8 +759,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-513, %di ## imm = 0xFDFF
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k7
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -785,83 +789,82 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-8193, %di ## imm = 0xDFFF
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k0, %k1, %k1
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $2, %k6, %k6
-; KNL-NEXT:    korw %k6, %k1, %k6
+; KNL-NEXT:    kmovw %edi, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $2, %k5, %k5
+; KNL-NEXT:    korw %k5, %k0, %k5
 ; KNL-NEXT:    movw $-16385, %di ## imm = 0xBFFF
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k0, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kshiftlw $1, %k6, %k6
-; KNL-NEXT:    kshiftrw $1, %k6, %k6
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kshiftlw $1, %k5, %k5
+; KNL-NEXT:    kshiftrw $1, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    andl $1, %esi
+; KNL-NEXT:    kmovw %edx, %k5
+; KNL-NEXT:    kshiftlw $15, %k5, %k5
+; KNL-NEXT:    kshiftrw $14, %k5, %k5
 ; KNL-NEXT:    kmovw %esi, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
-; KNL-NEXT:    kmovw %r8d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
-; KNL-NEXT:    kmovw %r9d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    korw %k5, %k6, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
+; KNL-NEXT:    kmovw %r8d, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
+; KNL-NEXT:    kmovw %r9d, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k6, %k6
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $8, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kandw %k5, %k6, %k5
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $7, %k6, %k6
+; KNL-NEXT:    korw %k6, %k5, %k5
+; KNL-NEXT:    kandw %k7, %k5, %k5
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
@@ -885,34 +888,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $3, %k3, %k3
 ; KNL-NEXT:    korw %k3, %k2, %k2
-; KNL-NEXT:    kandw %k0, %k2, %k0
+; KNL-NEXT:    kandw %k1, %k2, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    xorl %ecx, %ecx
-; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    movl $65535, %edx ## imm = 0xFFFF
-; KNL-NEXT:    movl $0, %esi
-; KNL-NEXT:    cmovnel %edx, %esi
-; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    cmovnel %edx, %ecx
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k0, %k1, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kmovw %r11d, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %ecx, %k2
+; KNL-NEXT:    kmovw %r10d, %k2
 ; KNL-NEXT:    kandw %k1, %k2, %k1
 ; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftrw $1, %k0, %k1
@@ -1008,16 +1004,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    pushq %r13
 ; SKX-NEXT:    pushq %r12
 ; SKX-NEXT:    pushq %rbx
-; SKX-NEXT:    movq %rdi, %rax
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    kshiftld $31, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    movl $-3, %edi
-; SKX-NEXT:    kmovd %edi, %k2
-; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT:    kandd %k2, %k0, %k0
+; SKX-NEXT:    kshiftrd $30, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
-; SKX-NEXT:    kshiftrd $30, %k1, %k1
-; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    kshiftrd $31, %k1, %k1
+; SKX-NEXT:    kord %k0, %k1, %k0
 ; SKX-NEXT:    movl $-5, %edi
 ; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
@@ -1034,10 +1028,10 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $28, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    movl $-17, %edi
 ; SKX-NEXT:    kmovd %edi, %k2
 ; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kandd %k2, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $27, %k1, %k1
@@ -1058,10 +1052,10 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $25, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    movl $-129, %edi
 ; SKX-NEXT:    kmovd %edi, %k2
 ; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kandd %k2, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $24, %k1, %k1
@@ -1082,10 +1076,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $22, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    movl $-1025, %edi ## imm = 0xFBFF
-; SKX-NEXT:    kmovd %edi, %k6
-; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandd %k2, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $21, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
@@ -1098,23 +1093,22 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftrd $20, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    movl $-4097, %edi ## imm = 0xEFFF
-; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT:    kandd %k1, %k0, %k0
+; SKX-NEXT:    kmovd %edi, %k6
+; SKX-NEXT:    kandd %k6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $19, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    movl $-8193, %edi ## imm = 0xDFFF
-; SKX-NEXT:    kmovd %edi, %k4
-; SKX-NEXT:    kandd %k4, %k0, %k0
+; SKX-NEXT:    kmovd %edi, %k5
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandd %k5, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $18, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    movl $-16385, %edi ## imm = 0xBFFF
-; SKX-NEXT:    kmovd %edi, %k5
-; SKX-NEXT:    kandd %k5, %k0, %k0
+; SKX-NEXT:    kmovd %edi, %k4
+; SKX-NEXT:    kandd %k4, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $17, %k1, %k1
@@ -1126,21 +1120,21 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftld $31, %k7, %k7
 ; SKX-NEXT:    kshiftrd $16, %k7, %k7
 ; SKX-NEXT:    kord %k7, %k0, %k7
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    movl $-65537, %edi ## imm = 0xFFFEFFFF
 ; SKX-NEXT:    kmovd %edi, %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kandd %k2, %k7, %k7
 ; SKX-NEXT:    kshiftld $31, %k0, %k0
 ; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    kord %k0, %k7, %k0
 ; SKX-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT:    kmovd %esi, %k0
-; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT:    kandd %k1, %k0, %k0
-; SKX-NEXT:    kmovd %edx, %k7
+; SKX-NEXT:    kmovd %edx, %k0
+; SKX-NEXT:    kshiftld $31, %k0, %k0
+; SKX-NEXT:    kshiftrd $30, %k0, %k0
+; SKX-NEXT:    kmovd %esi, %k7
 ; SKX-NEXT:    kshiftld $31, %k7, %k7
-; SKX-NEXT:    kshiftrd $30, %k7, %k7
-; SKX-NEXT:    kord %k7, %k0, %k0
+; SKX-NEXT:    kshiftrd $31, %k7, %k7
+; SKX-NEXT:    kord %k0, %k7, %k0
 ; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %ecx, %k7
@@ -1189,29 +1183,29 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $22, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $21, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
-; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload
+; SKX-NEXT:    kandd %k7, %k0, %k0
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $20, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT:    kandd %k1, %k0, %k0
-; SKX-NEXT:    kshiftld $31, %k6, %k1
+; SKX-NEXT:    kandd %k6, %k0, %k0
+; SKX-NEXT:    kshiftld $31, %k7, %k1
 ; SKX-NEXT:    kshiftrd $19, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kandd %k4, %k0, %k0
+; SKX-NEXT:    kandd %k5, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $18, %k1, %k1
 ; SKX-NEXT:    kord %k1, %k0, %k0
-; SKX-NEXT:    kandd %k5, %k0, %k0
+; SKX-NEXT:    kandd %k4, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
 ; SKX-NEXT:    kshiftrd $17, %k1, %k1
@@ -1321,243 +1315,239 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    pushl %ebx
 ; KNL_X32-NEXT:    pushl %edi
 ; KNL_X32-NEXT:    pushl %esi
-; KNL_X32-NEXT:    subl $20, %esp
-; KNL_X32-NEXT:    movw $-3, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    subl $16, %esp
+; KNL_X32-NEXT:    xorl %eax, %eax
+; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    movl $65535, %edx ## imm = 0xFFFF
+; KNL_X32-NEXT:    movl $0, %ecx
+; KNL_X32-NEXT:    cmovnel %edx, %ecx
+; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    cmovnel %edx, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    andl $1, %edx
+; KNL_X32-NEXT:    kmovw %edx, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-5, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-5, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $13, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-9, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-9, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-17, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-17, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $11, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-33, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-33, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $10, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-65, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-65, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $9, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-129, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-129, %dx
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $8, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-257, %ax ## imm = 0xFEFF
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-257, %dx ## imm = 0xFEFF
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $7, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-513, %ax ## imm = 0xFDFF
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-513, %dx ## imm = 0xFDFF
+; KNL_X32-NEXT:    kmovw %edx, %k7
+; KNL_X32-NEXT:    kandw %k7, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $6, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
-; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    movw $-1025, %dx ## imm = 0xFBFF
+; KNL_X32-NEXT:    kmovw %edx, %k4
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
-; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    movw $-2049, %dx ## imm = 0xF7FF
+; KNL_X32-NEXT:    kmovw %edx, %k3
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $4, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-4097, %ax ## imm = 0xEFFF
-; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    movw $-4097, %dx ## imm = 0xEFFF
+; KNL_X32-NEXT:    kmovw %edx, %k2
 ; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $3, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    movw $-8193, %ax ## imm = 0xDFFF
-; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    movw $-8193, %dx ## imm = 0xDFFF
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k5
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $2, %k5, %k5
+; KNL_X32-NEXT:    korw %k5, %k0, %k5
+; KNL_X32-NEXT:    movw $-16385, %dx ## imm = 0xBFFF
+; KNL_X32-NEXT:    kmovw %edx, %k0
+; KNL_X32-NEXT:    kandw %k0, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $14, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kshiftlw $1, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $1, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $2, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k6
-; KNL_X32-NEXT:    movw $-16385, %ax ## imm = 0xBFFF
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k0, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kshiftlw $1, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $1, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $8, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k7, %k6, %k6
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $7, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kandw %k5, %k6, %k5
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw %k5, (%esp) ## 2-byte Spill
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    andl $1, %edx
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; KNL_X32-NEXT:    kmovw %ebx, %k5
+; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $14, %k5, %k5
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    korw %k5, %k6, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $8, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $7, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
+; KNL_X32-NEXT:    kandw %k7, %k5, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL_X32-NEXT:    kshiftrw $6, %k6, %k6
 ; KNL_X32-NEXT:    korw %k6, %k5, %k5
 ; KNL_X32-NEXT:    kandw %k4, %k5, %k4
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k5
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL_X32-NEXT:    kshiftrw $5, %k5, %k5
 ; KNL_X32-NEXT:    korw %k5, %k4, %k4
 ; KNL_X32-NEXT:    kandw %k3, %k4, %k3
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL_X32-NEXT:    kshiftrw $4, %k4, %k4
 ; KNL_X32-NEXT:    korw %k4, %k3, %k3
 ; KNL_X32-NEXT:    kandw %k2, %k3, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL_X32-NEXT:    kshiftrw $3, %k3, %k3
 ; KNL_X32-NEXT:    korw %k3, %k2, %k2
 ; KNL_X32-NEXT:    kandw %k1, %k2, %k1
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $2, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    xorl %eax, %eax
-; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    movl $65535, %ecx ## imm = 0xFFFF
-; KNL_X32-NEXT:    movl $0, %edx
-; KNL_X32-NEXT:    cmovnel %ecx, %edx
 ; KNL_X32-NEXT:    kandw %k0, %k1, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    kmovw %edx, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw %edx, %k1
-; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    cmovnel %ecx, %eax
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL_X32-NEXT:    kmovw %ecx, %k1
+; KNL_X32-NEXT:    kmovw (%esp), %k2 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kandw %k1, %k2, %k1
@@ -1640,7 +1630,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    orl %ebx, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
 ; KNL_X32-NEXT:    movw %cx, (%eax)
-; KNL_X32-NEXT:    addl $20, %esp
+; KNL_X32-NEXT:    addl $16, %esp
 ; KNL_X32-NEXT:    popl %esi
 ; KNL_X32-NEXT:    popl %edi
 ; KNL_X32-NEXT:    popl %ebx
@@ -1655,16 +1645,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    pushq %r13
 ; FASTISEL-NEXT:    pushq %r12
 ; FASTISEL-NEXT:    pushq %rbx
-; FASTISEL-NEXT:    movq %rdi, %rax
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; FASTISEL-NEXT:    movq %rdi, %rax
+; FASTISEL-NEXT:    kshiftld $31, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    movl $-3, %edi
-; FASTISEL-NEXT:    kmovd %edi, %k2
-; FASTISEL-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT:    kandd %k2, %k0, %k0
+; FASTISEL-NEXT:    kshiftrd $30, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
-; FASTISEL-NEXT:    kshiftrd $30, %k1, %k1
-; FASTISEL-NEXT:    kord %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftrd $31, %k1, %k1
+; FASTISEL-NEXT:    kord %k0, %k1, %k0
 ; FASTISEL-NEXT:    movl $-5, %edi
 ; FASTISEL-NEXT:    kmovd %edi, %k1
 ; FASTISEL-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
@@ -1681,10 +1669,10 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $28, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    movl $-17, %edi
 ; FASTISEL-NEXT:    kmovd %edi, %k2
 ; FASTISEL-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kandd %k2, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $27, %k1, %k1
@@ -1705,10 +1693,10 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $25, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    movl $-129, %edi
 ; FASTISEL-NEXT:    kmovd %edi, %k2
 ; FASTISEL-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kandd %k2, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $24, %k1, %k1
@@ -1729,10 +1717,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $22, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    movl $-1025, %edi ## imm = 0xFBFF
-; FASTISEL-NEXT:    kmovd %edi, %k6
-; FASTISEL-NEXT:    kandd %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovd %edi, %k2
+; FASTISEL-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kandd %k2, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $21, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
@@ -1745,23 +1734,22 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftrd $20, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
 ; FASTISEL-NEXT:    movl $-4097, %edi ## imm = 0xEFFF
-; FASTISEL-NEXT:    kmovd %edi, %k1
-; FASTISEL-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT:    kandd %k1, %k0, %k0
+; FASTISEL-NEXT:    kmovd %edi, %k6
+; FASTISEL-NEXT:    kandd %k6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $19, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    movl $-8193, %edi ## imm = 0xDFFF
-; FASTISEL-NEXT:    kmovd %edi, %k4
-; FASTISEL-NEXT:    kandd %k4, %k0, %k0
+; FASTISEL-NEXT:    kmovd %edi, %k5
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kandd %k5, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $18, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
 ; FASTISEL-NEXT:    movl $-16385, %edi ## imm = 0xBFFF
-; FASTISEL-NEXT:    kmovd %edi, %k5
-; FASTISEL-NEXT:    kandd %k5, %k0, %k0
+; FASTISEL-NEXT:    kmovd %edi, %k4
+; FASTISEL-NEXT:    kandd %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $17, %k1, %k1
@@ -1773,21 +1761,21 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftld $31, %k7, %k7
 ; FASTISEL-NEXT:    kshiftrd $16, %k7, %k7
 ; FASTISEL-NEXT:    kord %k7, %k0, %k7
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; FASTISEL-NEXT:    movl $-65537, %edi ## imm = 0xFFFEFFFF
 ; FASTISEL-NEXT:    kmovd %edi, %k2
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; FASTISEL-NEXT:    kandd %k2, %k7, %k7
 ; FASTISEL-NEXT:    kshiftld $31, %k0, %k0
 ; FASTISEL-NEXT:    kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT:    kord %k0, %k7, %k0
 ; FASTISEL-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; FASTISEL-NEXT:    kmovd %esi, %k0
-; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT:    kandd %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovd %edx, %k7
+; FASTISEL-NEXT:    kmovd %edx, %k0
+; FASTISEL-NEXT:    kshiftld $31, %k0, %k0
+; FASTISEL-NEXT:    kshiftrd $30, %k0, %k0
+; FASTISEL-NEXT:    kmovd %esi, %k7
 ; FASTISEL-NEXT:    kshiftld $31, %k7, %k7
-; FASTISEL-NEXT:    kshiftrd $30, %k7, %k7
-; FASTISEL-NEXT:    kord %k7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrd $31, %k7, %k7
+; FASTISEL-NEXT:    kord %k0, %k7, %k0
 ; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
 ; FASTISEL-NEXT:    kandd %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovd %ecx, %k7
@@ -1836,29 +1824,29 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $22, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kandd %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
+; FASTISEL-NEXT:    kandd %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $21, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 4-byte Reload
-; FASTISEL-NEXT:    kandd %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload
+; FASTISEL-NEXT:    kandd %k7, %k0, %k0
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $20, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; FASTISEL-NEXT:    kandd %k1, %k0, %k0
-; FASTISEL-NEXT:    kshiftld $31, %k6, %k1
+; FASTISEL-NEXT:    kandd %k6, %k0, %k0
+; FASTISEL-NEXT:    kshiftld $31, %k7, %k1
 ; FASTISEL-NEXT:    kshiftrd $19, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kandd %k4, %k0, %k0
+; FASTISEL-NEXT:    kandd %k5, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $18, %k1, %k1
 ; FASTISEL-NEXT:    kord %k1, %k0, %k0
-; FASTISEL-NEXT:    kandd %k5, %k0, %k0
+; FASTISEL-NEXT:    kandd %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftld $31, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrd $17, %k1, %k1
@@ -1969,11 +1957,9 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movw $-3, %di
-; KNL-NEXT:    kmovw %edi, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -1983,343 +1969,343 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kshiftlw $15, %k2, %k2
+; KNL-NEXT:    kshiftrw $13, %k2, %k2
+; KNL-NEXT:    korw %k2, %k0, %k0
+; KNL-NEXT:    movw $-9, %di
+; KNL-NEXT:    kmovw %edi, %k2
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $13, %k3, %k3
+; KNL-NEXT:    kshiftrw $12, %k3, %k3
 ; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movw $-9, %di
+; KNL-NEXT:    movw $-17, %di
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $12, %k4, %k4
+; KNL-NEXT:    kshiftrw $11, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movw $-17, %di
+; KNL-NEXT:    movw $-33, %di
 ; KNL-NEXT:    kmovw %edi, %k4
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $11, %k5, %k5
+; KNL-NEXT:    kshiftrw $10, %k5, %k5
 ; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movw $-33, %di
+; KNL-NEXT:    movw $-65, %di
 ; KNL-NEXT:    kmovw %edi, %k5
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
 ; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    movw $-65, %di
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %r10d, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %r10d, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %r10d, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %r10d, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %r10d, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; KNL-NEXT:    kmovw %edi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    andl $1, %esi
+; KNL-NEXT:    kmovw %edx, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %esi, %k7
+; KNL-NEXT:    korw %k0, %k7, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %ecx, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $13, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    kmovw %r8d, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $12, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    kmovw %r9d, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $11, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $10, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $9, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kandw %k2, %k7, %k2
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k7
+; KNL-NEXT:    andl $1, %ecx
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; KNL-NEXT:    kmovw %edx, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k2, %k2
-; KNL-NEXT:    kandw %k1, %k2, %k1
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    korw %k7, %k6, %k6
+; KNL-NEXT:    kandw %k1, %k6, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k3, %k1, %k1
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $12, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k4, %k1, %k1
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $11, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k5, %k1, %k1
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $10, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k6, %k1, %k1
+; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
@@ -2378,84 +2364,83 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-LABEL: test17:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    movq %rdi, %rax
-; SKX-NEXT:    movb $-3, %dil
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $7, %k1, %k1
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    movb $-5, %dil
 ; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    movb $-5, %dil
+; SKX-NEXT:    movb $-9, %dil
 ; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
+; SKX-NEXT:    kshiftrb $4, %k2, %k2
 ; SKX-NEXT:    korb %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; SKX-NEXT:    movb $-9, %dil
-; SKX-NEXT:    kmovd %edi, %k7
-; SKX-NEXT:    kandb %k7, %k0, %k0
-; SKX-NEXT:    kshiftlb $7, %k3, %k3
-; SKX-NEXT:    kshiftrb $4, %k3, %k3
-; SKX-NEXT:    korb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    movb $-17, %dil
+; SKX-NEXT:    kmovd %edi, %k3
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k2, %k2
+; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    korb %k2, %k0, %k0
+; SKX-NEXT:    movb $-33, %dil
 ; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; SKX-NEXT:    kshiftlb $7, %k4, %k4
-; SKX-NEXT:    kshiftrb $3, %k4, %k4
+; SKX-NEXT:    kshiftrb $2, %k4, %k4
 ; SKX-NEXT:    korb %k4, %k0, %k0
-; SKX-NEXT:    movb $-33, %dil
-; SKX-NEXT:    kmovd %edi, %k4
-; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    movb $-65, %dil
+; SKX-NEXT:    kmovd %edi, %k6
+; SKX-NEXT:    kandb %k6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kshiftlb $7, %k5, %k5
-; SKX-NEXT:    kshiftrb $2, %k5, %k5
+; SKX-NEXT:    kshiftrb $1, %k5, %k5
 ; SKX-NEXT:    korb %k5, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    movb $-65, %dil
-; SKX-NEXT:    kmovd %edi, %k5
-; SKX-NEXT:    kandb %k5, %k0, %k1
-; SKX-NEXT:    kshiftlb $7, %k6, %k6
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    kshiftrb $1, %k6, %k6
-; SKX-NEXT:    korb %k6, %k1, %k1
-; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; SKX-NEXT:    kandb %k6, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; SKX-NEXT:    kandb %k3, %k0, %k2
-; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    korb %k1, %k2, %k1
-; SKX-NEXT:    kandb %k7, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k0, %k0
-; SKX-NEXT:    kshiftrb $4, %k0, %k0
-; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kshiftlb $7, %k7, %k7
+; SKX-NEXT:    kshiftrb $7, %k7, %k7
+; SKX-NEXT:    korb %k0, %k7, %k0
 ; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
 ; SKX-NEXT:    kandb %k2, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $3, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kshiftlb $7, %k7, %k7
+; SKX-NEXT:    kshiftrb $5, %k7, %k7
+; SKX-NEXT:    korb %k7, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; SKX-NEXT:    kandb %k5, %k0, %k1
+; SKX-NEXT:    kshiftlb $7, %k7, %k7
+; SKX-NEXT:    kshiftrb $4, %k7, %k7
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    korb %k7, %k1, %k1
+; SKX-NEXT:    kandb %k3, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $3, %k0, %k0
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
 ; SKX-NEXT:    kandb %k4, %k0, %k0
-; SKX-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $2, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kandb %k6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1
@@ -2464,128 +2449,128 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k1, %k1
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kandb %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k7, %k0, %k0
+; SKX-NEXT:    kmovq %k5, %k7
+; SKX-NEXT:    kandb %k5, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $4, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kmovq %k2, %k3
-; SKX-NEXT:    kandb %k2, %k0, %k0
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $3, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kandb %k4, %k0, %k0
-; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    kshiftlb $7, %k5, %k1
 ; SKX-NEXT:    kshiftrb $2, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kandb %k6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kmovq %k6, %k0
-; SKX-NEXT:    kandb %k6, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
-; SKX-NEXT:    kandb %k4, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k6, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $7, %k5, %k5
+; SKX-NEXT:    korb %k1, %k5, %k1
+; SKX-NEXT:    kandb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $5, %k5, %k5
+; SKX-NEXT:    korb %k5, %k1, %k1
 ; SKX-NEXT:    kandb %k7, %k1, %k1
-; SKX-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $4, %k5, %k5
+; SKX-NEXT:    korb %k5, %k1, %k1
 ; SKX-NEXT:    kandb %k3, %k1, %k1
-; SKX-NEXT:    kmovq %k3, %k6
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; SKX-NEXT:    kandb %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k5, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $1, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kandb %k0, %k1, %k1
-; SKX-NEXT:    kmovq %k0, %k3
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $3, %k5, %k5
+; SKX-NEXT:    korb %k5, %k1, %k1
 ; SKX-NEXT:    kandb %k4, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k7, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $2, %k5, %k5
+; SKX-NEXT:    korb %k5, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
 ; SKX-NEXT:    kandb %k6, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
+; SKX-NEXT:    kshiftlb $7, %k5, %k5
+; SKX-NEXT:    kshiftrb $1, %k5, %k5
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    korb %k5, %k1, %k5
+; SKX-NEXT:    kshiftlb $7, %k7, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kshiftlb $7, %k7, %k7
+; SKX-NEXT:    kshiftrb $7, %k7, %k7
+; SKX-NEXT:    korb %k1, %k7, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    kandb %k2, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k7, %k7
+; SKX-NEXT:    kshiftrb $5, %k7, %k7
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    korb %k7, %k1, %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; SKX-NEXT:    kandb %k3, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k0, %k0
-; SKX-NEXT:    kshiftrb $2, %k0, %k0
+; SKX-NEXT:    kshiftrb $4, %k0, %k0
 ; SKX-NEXT:    korb %k0, %k1, %k0
-; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; SKX-NEXT:    kandb %k6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; SKX-NEXT:    kandb %k1, %k0, %k0
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; SKX-NEXT:    kandb %k1, %k0, %k0
-; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kandb %k4, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    kshiftrb $2, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; SKX-NEXT:    kandb %k2, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
-; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kshiftlb $7, %k1, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $7, %k1, %k1
+; SKX-NEXT:    korb %k0, %k1, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; SKX-NEXT:    kandb %k5, %k0, %k0
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k7, %k1
 ; SKX-NEXT:    kshiftrb $4, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kandb %k6, %k0, %k0
@@ -2593,60 +2578,28 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $3, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovq %k4, %k7
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kandb %k7, %k0, %k0
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; SKX-NEXT:    kandb %k5, %k0, %k0
-; SKX-NEXT:    kshiftlb $7, %k2, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; SKX-NEXT:    kandb %k2, %k0, %k0
+; SKX-NEXT:    kmovq %k2, %k3
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k3, %k2, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; SKX-NEXT:    kandb %k6, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $5, %k2, %k2
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k4, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k6, %k2
-; SKX-NEXT:    kshiftrb $4, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; SKX-NEXT:    kandb %k6, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $3, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k7, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $2, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k5, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $1, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k0, %k1, %k0
 ; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
-; SKX-NEXT:    kshiftrb $6, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k1, %k1
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kmovq %k5, %k4
+; SKX-NEXT:    kandb %k5, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
@@ -2658,34 +2611,73 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-NEXT:    kshiftrb $4, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandb %k6, %k0, %k2
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $3, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    korb %k1, %k2, %k1
+; SKX-NEXT:    kmovq %k7, %k2
+; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $2, %k0, %k0
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kandb %k3, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $1, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; SKX-NEXT:    kandb %k1, %k0, %k0
+; SKX-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $7, %k1, %k1
+; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kandb %k4, %k0, %k0
+; SKX-NEXT:    kmovq %k4, %k7
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $5, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    kmovq %k5, %k3
+; SKX-NEXT:    kandb %k5, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kandb %k6, %k0, %k0
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $3, %k1, %k1
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kandb %k7, %k0, %k0
-; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; SKX-NEXT:    kandb %k2, %k0, %k0
+; SKX-NEXT:    kmovq %k2, %k5
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $2, %k1, %k1
+; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
 ; SKX-NEXT:    kandb %k4, %k0, %k0
-; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftlb $7, %k2, %k1
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
-; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; SKX-NEXT:    kandb %k2, %k1, %k1
-; SKX-NEXT:    kmovd %edx, %k2
+; SKX-NEXT:    kmovd %edx, %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k1, %k1
+; SKX-NEXT:    kmovd %esi, %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
-; SKX-NEXT:    kshiftrb $6, %k2, %k2
-; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k3, %k1, %k1
+; SKX-NEXT:    kshiftrb $7, %k2, %k2
+; SKX-NEXT:    korb %k1, %k2, %k1
+; SKX-NEXT:    kandb %k7, %k1, %k1
 ; SKX-NEXT:    kmovd %ecx, %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
 ; SKX-NEXT:    kshiftrb $5, %k2, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k5, %k1, %k1
+; SKX-NEXT:    kandb %k3, %k1, %k1
 ; SKX-NEXT:    kmovd %r8d, %k2
 ; SKX-NEXT:    kshiftlb $7, %k2, %k2
 ; SKX-NEXT:    kshiftrb $4, %k2, %k2
@@ -2696,7 +2688,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; SKX-NEXT:    kshiftrb $3, %k2, %k2
 ; SKX-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; SKX-NEXT:    korb %k2, %k1, %k1
-; SKX-NEXT:    kandb %k7, %k1, %k1
+; SKX-NEXT:    kandb %k5, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k3, %k2
 ; SKX-NEXT:    kshiftrb $2, %k2, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
@@ -2751,11 +2743,9 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32:       ## %bb.0:
 ; KNL_X32-NEXT:    pushl %ebx
 ; KNL_X32-NEXT:    subl $16, %esp
-; KNL_X32-NEXT:    movw $-3, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    andl $1, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
@@ -2765,348 +2755,348 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
+; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
+; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    movw $-9, %ax
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
-; KNL_X32-NEXT:    kshiftrw $13, %k3, %k3
+; KNL_X32-NEXT:    kshiftrw $12, %k3, %k3
 ; KNL_X32-NEXT:    korw %k3, %k0, %k0
-; KNL_X32-NEXT:    movw $-9, %ax
+; KNL_X32-NEXT:    movw $-17, %ax
 ; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
-; KNL_X32-NEXT:    kshiftrw $12, %k4, %k4
+; KNL_X32-NEXT:    kshiftrw $11, %k4, %k4
 ; KNL_X32-NEXT:    korw %k4, %k0, %k0
-; KNL_X32-NEXT:    movw $-17, %ax
+; KNL_X32-NEXT:    movw $-33, %ax
 ; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
-; KNL_X32-NEXT:    kshiftrw $11, %k5, %k5
+; KNL_X32-NEXT:    kshiftrw $10, %k5, %k5
 ; KNL_X32-NEXT:    korw %k5, %k0, %k0
-; KNL_X32-NEXT:    movw $-33, %ax
+; KNL_X32-NEXT:    movw $-65, %ax
 ; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
 ; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    movw $-65, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    andl $1, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $14, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k0, %k6, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
-; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    korw %k0, %k7, %k0
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k5, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
 ; KNL_X32-NEXT:    korw %k7, %k0, %k0
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    kandw %k2, %k7, %k2
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    andl $1, %eax
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; KNL_X32-NEXT:    kmovw %ecx, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k2, %k2
-; KNL_X32-NEXT:    kandw %k1, %k2, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k3, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
+; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
+; KNL_X32-NEXT:    korw %k6, %k1, %k1
+; KNL_X32-NEXT:    kandw %k2, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k4, %k1, %k1
+; KNL_X32-NEXT:    kandw %k3, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k5, %k1, %k1
+; KNL_X32-NEXT:    kandw %k4, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k6, %k1, %k1
+; KNL_X32-NEXT:    kandw %k5, %k1, %k1
 ; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
@@ -3168,123 +3158,120 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; FASTISEL-LABEL: test17:
 ; FASTISEL:       ## %bb.0:
 ; FASTISEL-NEXT:    movq %rdi, %rax
-; FASTISEL-NEXT:    movb $-3, %dil
-; FASTISEL-NEXT:    kmovd %edi, %k1
-; FASTISEL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    kandb %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $6, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $7, %k1, %k1
+; FASTISEL-NEXT:    korb %k0, %k1, %k0
 ; FASTISEL-NEXT:    movb $-5, %dil
+; FASTISEL-NEXT:    kmovd %edi, %k3
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
+; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    movb $-9, %dil
 ; FASTISEL-NEXT:    kmovd %edi, %k1
 ; FASTISEL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kandb %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $5, %k2, %k2
+; FASTISEL-NEXT:    kshiftrb $4, %k2, %k2
 ; FASTISEL-NEXT:    korb %k2, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
-; FASTISEL-NEXT:    movb $-9, %dil
-; FASTISEL-NEXT:    kmovd %edi, %k1
-; FASTISEL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; FASTISEL-NEXT:    kandb %k1, %k0, %k0
-; FASTISEL-NEXT:    kshiftlb $7, %k3, %k3
-; FASTISEL-NEXT:    kshiftrb $4, %k3, %k3
-; FASTISEL-NEXT:    korb %k3, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    movb $-17, %dil
 ; FASTISEL-NEXT:    kmovd %edi, %k1
 ; FASTISEL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kandb %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
+; FASTISEL-NEXT:    kshiftrb $3, %k2, %k2
+; FASTISEL-NEXT:    korb %k2, %k0, %k0
+; FASTISEL-NEXT:    movb $-33, %dil
+; FASTISEL-NEXT:    kmovd %edi, %k5
+; FASTISEL-NEXT:    kandb %k5, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k4
 ; FASTISEL-NEXT:    kshiftlb $7, %k4, %k4
-; FASTISEL-NEXT:    kshiftrb $3, %k4, %k4
+; FASTISEL-NEXT:    kshiftrb $2, %k4, %k4
 ; FASTISEL-NEXT:    korb %k4, %k0, %k0
-; FASTISEL-NEXT:    movb $-33, %dil
+; FASTISEL-NEXT:    movb $-65, %dil
 ; FASTISEL-NEXT:    kmovd %edi, %k1
 ; FASTISEL-NEXT:    kandb %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovq %k1, %k4
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k5
-; FASTISEL-NEXT:    kshiftlb $7, %k5, %k5
-; FASTISEL-NEXT:    kshiftrb $2, %k5, %k5
-; FASTISEL-NEXT:    korb %k5, %k0, %k0
+; FASTISEL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT:    kshiftlb $7, %k6, %k6
+; FASTISEL-NEXT:    kshiftrb $1, %k6, %k6
+; FASTISEL-NEXT:    korb %k6, %k0, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT:    kshiftlb $7, %k6, %k6
+; FASTISEL-NEXT:    kshiftrb $6, %k6, %k6
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
-; FASTISEL-NEXT:    movb $-65, %dil
-; FASTISEL-NEXT:    kmovd %edi, %k6
-; FASTISEL-NEXT:    kandb %k6, %k0, %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k7, %k7
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    kshiftrb $1, %k7, %k7
-; FASTISEL-NEXT:    korb %k7, %k1, %k7
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k3, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftrb $7, %k7, %k7
+; FASTISEL-NEXT:    korb %k6, %k7, %k6
+; FASTISEL-NEXT:    kandb %k3, %k6, %k6
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k7
+; FASTISEL-NEXT:    kshiftrb $5, %k7, %k7
+; FASTISEL-NEXT:    korb %k7, %k6, %k6
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
 ; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k2, %k0, %k2
-; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
+; FASTISEL-NEXT:    kandb %k2, %k6, %k6
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k7
+; FASTISEL-NEXT:    kshiftrb $4, %k7, %k7
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    korb %k1, %k2, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k5, %k1, %k1
+; FASTISEL-NEXT:    korb %k7, %k6, %k6
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k7, %k6, %k6
 ; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
-; FASTISEL-NEXT:    kshiftrb $4, %k0, %k0
-; FASTISEL-NEXT:    korb %k0, %k1, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k2, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $3, %k0, %k0
+; FASTISEL-NEXT:    korb %k0, %k6, %k0
+; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT:    kshiftlb $7, %k6, %k6
+; FASTISEL-NEXT:    kshiftrb $2, %k6, %k6
+; FASTISEL-NEXT:    korb %k6, %k0, %k0
 ; FASTISEL-NEXT:    kandb %k4, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k6, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k7, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
+; FASTISEL-NEXT:    kshiftlb $7, %k6, %k6
+; FASTISEL-NEXT:    kshiftrb $1, %k6, %k6
+; FASTISEL-NEXT:    korb %k6, %k0, %k0
+; FASTISEL-NEXT:    kandb %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    kandb %k3, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $7, %k1, %k1
+; FASTISEL-NEXT:    korb %k0, %k1, %k0
+; FASTISEL-NEXT:    kmovq %k3, %k7
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kandb %k2, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $4, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kmovq %k2, %k3
-; FASTISEL-NEXT:    kandb %k2, %k0, %k0
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k4, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k6
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k4, %k0, %k0
-; FASTISEL-NEXT:    kmovq %k4, %k5
-; FASTISEL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
+; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k6, %k1
 ; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovq %k6, %k4
-; FASTISEL-NEXT:    kandb %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
@@ -3292,141 +3279,141 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k6, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $6, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $7, %k1, %k1
+; FASTISEL-NEXT:    korb %k0, %k1, %k0
 ; FASTISEL-NEXT:    kandb %k7, %k0, %k0
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
+; FASTISEL-NEXT:    kmovq %k7, %k5
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k7, %k0, %k0
+; FASTISEL-NEXT:    kandb %k2, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $4, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k3, %k0, %k0
+; FASTISEL-NEXT:    kandb %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kandb %k6, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $6, %k2, %k2
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; FASTISEL-NEXT:    korb %k1, %k0, %k2
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k1
+; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k7
+; FASTISEL-NEXT:    kshiftrb $7, %k7, %k7
+; FASTISEL-NEXT:    korb %k1, %k7, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
 ; FASTISEL-NEXT:    kandb %k5, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $5, %k2, %k2
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kandb %k7, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $4, %k2, %k2
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kandb %k3, %k1, %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $3, %k2, %k2
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k7
+; FASTISEL-NEXT:    kshiftrb $5, %k7, %k7
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k2, %k1, %k1
+; FASTISEL-NEXT:    korb %k7, %k1, %k1
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
-; FASTISEL-NEXT:    kshiftrb $2, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $4, %k0, %k0
 ; FASTISEL-NEXT:    korb %k0, %k1, %k0
-; FASTISEL-NEXT:    kmovq %k4, %k3
-; FASTISEL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kandb %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    kmovq %k6, %k4
 ; FASTISEL-NEXT:    kandb %k6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k7, %k0, %k0
+; FASTISEL-NEXT:    kandb %k2, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k2, %k0, %k0
+; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k0
+; FASTISEL-NEXT:    kshiftrb $6, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $7, %k1, %k1
+; FASTISEL-NEXT:    korb %k0, %k1, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kmovq %k5, %k3
+; FASTISEL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; FASTISEL-NEXT:    kandb %k5, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kandb %k7, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
 ; FASTISEL-NEXT:    kshiftrb $4, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovq %k4, %k5
+; FASTISEL-NEXT:    kandb %k4, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kandb %k6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
-; FASTISEL-NEXT:    kandb %k2, %k0, %k0
-; FASTISEL-NEXT:    kmovq %k2, %k7
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k3, %k0, %k0
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k4, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k4, %k2, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $6, %k2, %k2
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $7, %k2, %k2
+; FASTISEL-NEXT:    korb %k1, %k2, %k1
+; FASTISEL-NEXT:    kandb %k3, %k1, %k1
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kandb %k5, %k1, %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $5, %k2, %k2
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k4, %k1, %k1
-; FASTISEL-NEXT:    kshiftlb $7, %k3, %k2
+; FASTISEL-NEXT:    kandb %k7, %k1, %k1
+; FASTISEL-NEXT:    kmovq %k7, %k3
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $4, %k2, %k2
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kandb %k6, %k1, %k1
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    kandb %k5, %k1, %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $3, %k2, %k2
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k7
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kandb %k7, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
+; FASTISEL-NEXT:    kandb %k6, %k1, %k1
+; FASTISEL-NEXT:    kmovq %k6, %k5
+; FASTISEL-NEXT:    kshiftlb $7, %k7, %k2
 ; FASTISEL-NEXT:    kshiftrb $2, %k2, %k2
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k3, %k1, %k1
+; FASTISEL-NEXT:    kandb %k4, %k1, %k1
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $1, %k2, %k2
@@ -3434,52 +3421,52 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; FASTISEL-NEXT:    kandb %k0, %k1, %k0
 ; FASTISEL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k5, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $6, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
-; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k1, %k0, %k0
+; FASTISEL-NEXT:    kshiftrb $7, %k1, %k1
+; FASTISEL-NEXT:    korb %k0, %k1, %k0
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; FASTISEL-NEXT:    kandb %k7, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $5, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k4, %k0, %k0
+; FASTISEL-NEXT:    kandb %k3, %k0, %k0
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $4, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
 ; FASTISEL-NEXT:    kandb %k6, %k0, %k0
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
 ; FASTISEL-NEXT:    kshiftrb $3, %k1, %k1
-; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
-; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k7, %k0, %k0
-; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
-; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kandb %k3, %k0, %k0
-; FASTISEL-NEXT:    kmovq %k3, %k7
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; FASTISEL-NEXT:    kandb %k5, %k0, %k0
 ; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $2, %k1, %k1
+; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; FASTISEL-NEXT:    korb %k1, %k0, %k0
+; FASTISEL-NEXT:    kandb %k4, %k0, %k0
+; FASTISEL-NEXT:    kshiftlb $7, %k2, %k1
 ; FASTISEL-NEXT:    kshiftrb $1, %k1, %k1
 ; FASTISEL-NEXT:    korb %k1, %k0, %k0
-; FASTISEL-NEXT:    kmovd %esi, %k1
-; FASTISEL-NEXT:    kandb %k5, %k1, %k1
-; FASTISEL-NEXT:    kmovd %edx, %k2
+; FASTISEL-NEXT:    kmovd %edx, %k1
+; FASTISEL-NEXT:    kshiftlb $7, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $6, %k1, %k1
+; FASTISEL-NEXT:    kmovd %esi, %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
-; FASTISEL-NEXT:    kshiftrb $6, %k2, %k2
-; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k2, %k1, %k1
+; FASTISEL-NEXT:    kshiftrb $7, %k2, %k2
+; FASTISEL-NEXT:    korb %k1, %k2, %k1
+; FASTISEL-NEXT:    kandb %k7, %k1, %k1
 ; FASTISEL-NEXT:    kmovd %ecx, %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $5, %k2, %k2
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kandb %k4, %k1, %k1
+; FASTISEL-NEXT:    kandb %k3, %k1, %k1
 ; FASTISEL-NEXT:    kmovd %r8d, %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $4, %k2, %k2
@@ -3490,12 +3477,11 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; FASTISEL-NEXT:    kshiftrb $3, %k2, %k2
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k3
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; FASTISEL-NEXT:    kandb %k2, %k1, %k1
+; FASTISEL-NEXT:    kandb %k5, %k1, %k1
 ; FASTISEL-NEXT:    kshiftlb $7, %k3, %k2
 ; FASTISEL-NEXT:    kshiftrb $2, %k2, %k2
 ; FASTISEL-NEXT:    korb %k2, %k1, %k1
-; FASTISEL-NEXT:    kandb %k7, %k1, %k1
+; FASTISEL-NEXT:    kandb %k4, %k1, %k1
 ; FASTISEL-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
 ; FASTISEL-NEXT:    kshiftlb $7, %k2, %k2
 ; FASTISEL-NEXT:    kshiftrb $1, %k2, %k2

diff  --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 6298e47442ca..1bb3d2747fb4 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1886,13 +1886,9 @@ define void @extload_v8i64(ptr %a, ptr %res) {
 define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: test21:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    movw $-3, %ax
-; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k2
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -1900,8 +1896,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-5, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k7
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -1917,9 +1914,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-17, %ax
-; KNL-NEXT:    kmovw %eax, %k6
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -1928,7 +1925,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    movw $-33, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k3
+; KNL-NEXT:    kmovw %k1, %k2
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -1947,7 +1944,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    movw $-129, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k4
+; KNL-NEXT:    kmovw %k1, %k3
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -1966,7 +1963,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    movw $-513, %ax # imm = 0xFDFF
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k5
+; KNL-NEXT:    kmovw %k1, %k4
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -1983,9 +1980,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -2014,218 +2010,216 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kandw %k2, %k1, %k1
-; KNL-NEXT:    kmovw %esi, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
+; KNL-NEXT:    kmovw %edi, %k6
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    kandw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %edx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
 ; KNL-NEXT:    kandw %k0, %k1, %k1
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %r8d, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kandw %k2, %k1, %k1
+; KNL-NEXT:    kmovw %r9d, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; KNL-NEXT:    kandw %k2, %k1, %k1
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k6, %k1, %k1
-; KNL-NEXT:    kmovw %r8d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k3, %k1, %k1
-; KNL-NEXT:    kmovw %r9d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k4, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $8, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k5, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $7, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $6, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $6, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
 ; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $4, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $4, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $3, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $3, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $2, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $2, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    korw %k1, %k6, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k1, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k7, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k2, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
 ; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kandw %k6, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; KNL-NEXT:    kandw %k2, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $8, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $7, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; KNL-NEXT:    kandw %k3, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $6, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $6, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k4, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $4, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $4, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; KNL-NEXT:    kandw %k0, %k1, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $3, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; KNL-NEXT:    kandw %k2, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $3, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; KNL-NEXT:    kandw %k5, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $2, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $2, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; KNL-NEXT:    kandw %k5, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k1, %k1
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; KNL-NEXT:    kandw %k5, %k7, %k7
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k1, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    korw %k6, %k7, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; KNL-NEXT:    kandw %k5, %k6, %k6
@@ -2241,35 +2235,36 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $12, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kandw %k0, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $11, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $10, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $9, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $8, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
@@ -2287,21 +2282,20 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $5, %k6, %k6
 ; KNL-NEXT:    korw %k6, %k5, %k5
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k5, %k4
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; KNL-NEXT:    kandw %k3, %k5, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL-NEXT:    kshiftrw $4, %k5, %k5
 ; KNL-NEXT:    korw %k5, %k4, %k4
-; KNL-NEXT:    kandw %k2, %k4, %k3
+; KNL-NEXT:    kandw %k0, %k4, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $3, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k3, %k3
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; KNL-NEXT:    kandw %k0, %k3, %k2
+; KNL-NEXT:    kandw %k2, %k3, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
@@ -2346,13 +2340,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ;
 ; AVX512DQNOBW-LABEL: test21:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    movw $-3, %ax
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    andl $1, %eax
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
-; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k1, %k2
-; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
@@ -2360,8 +2350,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movw $-5, %ax
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k7
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
@@ -2377,9 +2368,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kshiftrw $12, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movw $-17, %ax
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
-; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
+; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
@@ -2388,7 +2379,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    movw $-33, %ax
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k1, %k3
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k2
 ; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
@@ -2407,7 +2398,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    movw $-129, %ax
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k1, %k4
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k3
 ; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
@@ -2426,7 +2417,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    movw $-513, %ax # imm = 0xFDFF
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k1, %k5
+; AVX512DQNOBW-NEXT:    kmovw %k1, %k4
 ; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
@@ -2443,9 +2434,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kshiftrw $5, %k1, %k1
 ; AVX512DQNOBW-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
-; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
+; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k1
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k1, %k1
@@ -2474,218 +2464,216 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQNOBW-NEXT:    kmovw %edi, %k0
-; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %esi, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    andl $1, %edi
+; AVX512DQNOBW-NEXT:    kmovw %esi, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %edi, %k6
+; AVX512DQNOBW-NEXT:    korw %k0, %k6, %k0
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %edx, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %ecx, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %edx, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %r8d, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %r9d, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %ecx, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %r8d, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %r9d, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    andl $1, %eax
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; AVX512DQNOBW-NEXT:    kmovw %ecx, %k0
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQNOBW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    korw %k0, %k6, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $13, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $14, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $12, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $13, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $10, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $9, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $7, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $7, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $6, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $6, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $5, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $4, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $4, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $3, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    kshiftrw $2, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQNOBW-NEXT:    korw %k7, %k0, %k0
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k5, %k7, %k7
-; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k6
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQNOBW-NEXT:    korw %k6, %k0, %k0
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQNOBW-NEXT:    andl $1, %eax
+; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; AVX512DQNOBW-NEXT:    kmovw %ecx, %k6
+; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
 ; AVX512DQNOBW-NEXT:    kshiftrw $14, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    korw %k6, %k7, %k6
 ; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
@@ -2701,35 +2689,36 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQNOBW-NEXT:    kshiftrw $12, %k7, %k7
 ; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
-; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQNOBW-NEXT:    kshiftrw $11, %k7, %k7
 ; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQNOBW-NEXT:    kshiftrw $10, %k7, %k7
 ; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQNOBW-NEXT:    kshiftrw $9, %k7, %k7
 ; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQNOBW-NEXT:    kshiftrw $8, %k7, %k7
 ; AVX512DQNOBW-NEXT:    korw %k7, %k6, %k6
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k7
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k7, %k7
@@ -2747,21 +2736,20 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k6, %k6
 ; AVX512DQNOBW-NEXT:    kshiftrw $5, %k6, %k6
 ; AVX512DQNOBW-NEXT:    korw %k6, %k5, %k5
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k5, %k4
+; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512DQNOBW-NEXT:    kandw %k3, %k5, %k4
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k5
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k5, %k5
 ; AVX512DQNOBW-NEXT:    kshiftrw $4, %k5, %k5
 ; AVX512DQNOBW-NEXT:    korw %k5, %k4, %k4
-; AVX512DQNOBW-NEXT:    kandw %k2, %k4, %k3
+; AVX512DQNOBW-NEXT:    kandw %k1, %k4, %k3
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k4
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQNOBW-NEXT:    kshiftrw $3, %k4, %k4
 ; AVX512DQNOBW-NEXT:    korw %k4, %k3, %k3
-; AVX512DQNOBW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQNOBW-NEXT:    kandw %k1, %k3, %k2
+; AVX512DQNOBW-NEXT:    kandw %k2, %k3, %k2
 ; AVX512DQNOBW-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQNOBW-NEXT:    kmovw %eax, %k3
 ; AVX512DQNOBW-NEXT:    kshiftlw $15, %k3, %k3

diff  --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 232995800a98..53d681dbcf86 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -2915,20 +2915,17 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movw $-3, %ax
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    andl $1, %esi
 ; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k2
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kmovw %edx, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-5, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k1, %k7
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
@@ -2942,9 +2939,9 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    kshiftrw $12, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-17, %ax
-; KNL-NEXT:    kmovw %eax, %k6
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %r9d, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $11, %k1, %k1
@@ -2952,7 +2949,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    movw $-33, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k3
+; KNL-NEXT:    kmovw %k1, %k2
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -2971,7 +2968,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    movw $-129, %ax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k4
+; KNL-NEXT:    kmovw %k1, %k3
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -2990,7 +2987,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    movw $-513, %ax ## imm = 0xFDFF
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k1, %k5
+; KNL-NEXT:    kmovw %k1, %k4
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -3007,9 +3004,8 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    kshiftrw $5, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k5
+; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -3038,224 +3034,222 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
+; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    kandw %k7, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $8, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $7, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $6, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $6, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $4, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $4, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $3, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $3, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $2, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; KNL-NEXT:    kandw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $2, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    korw %k0, %k6, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; KNL-NEXT:    kandw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $13, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k7, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $12, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $11, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $10, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k6, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; KNL-NEXT:    kandw %k2, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $9, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $8, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $8, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $7, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $7, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k3, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $6, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $6, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k4, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $5, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $5, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $4, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $4, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $3, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $3, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $2, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    kshiftrw $2, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k5, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $14, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
+; KNL-NEXT:    kmovw %eax, %k6
+; KNL-NEXT:    kshiftlw $14, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT:    kmovw %eax, %k7
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; KNL-NEXT:    kandw %k5, %k7, %k7
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
+; KNL-NEXT:    korw %k6, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k6
+; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $14, %k6, %k6
+; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    korw %k6, %k7, %k6
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
 ; KNL-NEXT:    kandw %k5, %k6, %k6
@@ -3271,35 +3265,36 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $12, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $11, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $10, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $9, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL-NEXT:    kshiftrw $8, %k7, %k7
 ; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; KNL-NEXT:    kandw %k5, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k7
 ; KNL-NEXT:    kshiftlw $15, %k7, %k7
@@ -3317,21 +3312,20 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $5, %k6, %k6
 ; KNL-NEXT:    korw %k6, %k5, %k5
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k5, %k4
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; KNL-NEXT:    kandw %k3, %k5, %k4
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k5
 ; KNL-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL-NEXT:    kshiftrw $4, %k5, %k5
 ; KNL-NEXT:    korw %k5, %k4, %k4
-; KNL-NEXT:    kandw %k2, %k4, %k3
+; KNL-NEXT:    kandw %k1, %k4, %k3
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $3, %k4, %k4
 ; KNL-NEXT:    korw %k4, %k3, %k3
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k3, %k2
+; KNL-NEXT:    kandw %k2, %k3, %k2
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; KNL-NEXT:    kmovw %eax, %k3
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
@@ -3375,20 +3369,17 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_64i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    movw $-3, %ax
-; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    andl $1, %esi
 ; AVX512DQ-NEXT:    kmovw %esi, %k0
-; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kmovw %edx, %k1
 ; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movw $-5, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %k7
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kmovw %ecx, %k1
 ; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
@@ -3402,9 +3393,9 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movw $-17, %ax
-; AVX512DQ-NEXT:    kmovw %eax, %k6
-; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %r9d, %k1
 ; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
@@ -3412,7 +3403,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    movw $-33, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %k3
+; AVX512DQ-NEXT:    kmovw %k1, %k2
 ; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
@@ -3431,7 +3422,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    movw $-129, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k1, %k3
 ; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
@@ -3450,7 +3441,7 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    movw $-513, %ax ## imm = 0xFDFF
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %k5
+; AVX512DQ-NEXT:    kmovw %k1, %k4
 ; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
@@ -3467,9 +3458,8 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k1
 ; AVX512DQ-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k5
+; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k1
@@ -3498,224 +3488,222 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k0
-; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    korw %k0, %k6, %k0
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $14, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $13, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $12, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $13, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $10, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $9, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $7, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $7, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $6, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $6, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $5, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $4, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $4, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $3, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $2, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    korw %k0, %k6, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $13, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k7, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $14, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $12, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $13, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $10, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kandw %k6, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $9, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $7, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $7, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k3, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $6, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $6, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k4, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $5, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $4, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $4, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k2, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $3, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    kshiftrw $2, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k5, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $14, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %eax, %k6
+; AVX512DQ-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
-; AVX512DQ-NEXT:    korw %k7, %k0, %k0
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; AVX512DQ-NEXT:    kmovw %eax, %k7
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k5, %k7, %k7
-; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k6
 ; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512DQ-NEXT:    korw %k6, %k0, %k0
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; AVX512DQ-NEXT:    kmovw %ecx, %k6
+; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
 ; AVX512DQ-NEXT:    kshiftrw $14, %k6, %k6
+; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    korw %k6, %k7, %k6
 ; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
 ; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
@@ -3731,35 +3719,36 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQ-NEXT:    kshiftrw $12, %k7, %k7
 ; AVX512DQ-NEXT:    korw %k7, %k6, %k6
-; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQ-NEXT:    kshiftrw $11, %k7, %k7
 ; AVX512DQ-NEXT:    korw %k7, %k6, %k6
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQ-NEXT:    kshiftrw $10, %k7, %k7
 ; AVX512DQ-NEXT:    korw %k7, %k6, %k6
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQ-NEXT:    kshiftrw $9, %k7, %k7
 ; AVX512DQ-NEXT:    korw %k7, %k6, %k6
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
 ; AVX512DQ-NEXT:    kshiftrw $8, %k7, %k7
 ; AVX512DQ-NEXT:    korw %k7, %k6, %k6
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k6, %k6
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k5, %k6, %k6
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k7
 ; AVX512DQ-NEXT:    kshiftlw $15, %k7, %k7
@@ -3777,21 +3766,20 @@ define void @store_64i1(ptr %a, <64 x i1> %v) {
 ; AVX512DQ-NEXT:    kshiftlw $15, %k6, %k6
 ; AVX512DQ-NEXT:    kshiftrw $5, %k6, %k6
 ; AVX512DQ-NEXT:    korw %k6, %k5, %k5
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k5, %k4
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload
+; AVX512DQ-NEXT:    kandw %k3, %k5, %k4
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k5
 ; AVX512DQ-NEXT:    kshiftlw $15, %k5, %k5
 ; AVX512DQ-NEXT:    kshiftrw $4, %k5, %k5
 ; AVX512DQ-NEXT:    korw %k5, %k4, %k4
-; AVX512DQ-NEXT:    kandw %k2, %k4, %k3
+; AVX512DQ-NEXT:    kandw %k1, %k4, %k3
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
 ; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k4
 ; AVX512DQ-NEXT:    korw %k4, %k3, %k3
-; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; AVX512DQ-NEXT:    kandw %k1, %k3, %k2
+; AVX512DQ-NEXT:    kandw %k2, %k3, %k2
 ; AVX512DQ-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k3
 ; AVX512DQ-NEXT:    kshiftlw $15, %k3, %k3

diff  --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index da4439beac5a..df6cd6e9bc57 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1130,11 +1130,10 @@ define dso_local void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-ve
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k0
 ; CHECK-NEXT:    kshiftrd $1, %k0, %k1
-; CHECK-NEXT:    movq $-3, %rax
-; CHECK-NEXT:    kmovq %rax, %k2
-; CHECK-NEXT:    kandq %k2, %k1, %k1
 ; CHECK-NEXT:    kshiftlq $63, %k0, %k2
 ; CHECK-NEXT:    kshiftrq $62, %k2, %k2
+; CHECK-NEXT:    kshiftlq $63, %k1, %k1
+; CHECK-NEXT:    kshiftrq $63, %k1, %k1
 ; CHECK-NEXT:    korq %k2, %k1, %k1
 ; CHECK-NEXT:    movq $-5, %rax
 ; CHECK-NEXT:    kmovq %rax, %k2

diff  --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll
index da21aa68e759..74a691e52094 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll
@@ -709,15 +709,14 @@ define <2 x i16> @test_v2f16_oeq_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movb $-3, %al
-; X86-NEXT:    kmovd %eax, %k0
 ; X86-NEXT:    vucomish 8(%ebp), %xmm2
 ; X86-NEXT:    setnp %al
 ; X86-NEXT:    sete %cl
 ; X86-NEXT:    testb %al, %cl
 ; X86-NEXT:    setne %al
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    kandb %k0, %k1, %k0
+; X86-NEXT:    kmovd %eax, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; X86-NEXT:    vucomish 10(%ebp), %xmm2
 ; X86-NEXT:    setnp %al
@@ -741,9 +740,8 @@ define <2 x i16> @test_v2f16_oeq_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <
 ; X64-NEXT:    testb %al, %cl
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    kmovd %eax, %k0
-; X64-NEXT:    movb $-3, %al
-; X64-NEXT:    kmovd %eax, %k1
-; X64-NEXT:    kandb %k1, %k0, %k0
+; X64-NEXT:    kshiftlb $7, %k0, %k0
+; X64-NEXT:    kshiftrb $7, %k0, %k0
 ; X64-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; X64-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; X64-NEXT:    vucomish %xmm3, %xmm2
@@ -771,12 +769,11 @@ define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movb $-3, %al
-; X86-NEXT:    kmovd %eax, %k0
 ; X86-NEXT:    vcomish 8(%ebp), %xmm2
 ; X86-NEXT:    seta %al
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    kandb %k0, %k1, %k0
+; X86-NEXT:    kmovd %eax, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; X86-NEXT:    vcomish 10(%ebp), %xmm2
 ; X86-NEXT:    seta %al
@@ -791,12 +788,11 @@ define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <
 ;
 ; X64-LABEL: test_v2f16_ogt_q:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb $-3, %al
-; X64-NEXT:    kmovd %eax, %k0
 ; X64-NEXT:    vcomish %xmm3, %xmm2
 ; X64-NEXT:    seta %al
-; X64-NEXT:    kmovd %eax, %k1
-; X64-NEXT:    kandb %k0, %k1, %k0
+; X64-NEXT:    kmovd %eax, %k0
+; X64-NEXT:    kshiftlb $7, %k0, %k0
+; X64-NEXT:    kshiftrb $7, %k0, %k0
 ; X64-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; X64-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; X64-NEXT:    vcomish %xmm3, %xmm2
@@ -821,12 +817,11 @@ define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movb $-3, %al
-; X86-NEXT:    kmovd %eax, %k0
 ; X86-NEXT:    vucomish 8(%ebp), %xmm2
 ; X86-NEXT:    setae %al
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    kandb %k0, %k1, %k0
+; X86-NEXT:    kmovd %eax, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    vpsrld $16, %xmm2, %xmm3
 ; X86-NEXT:    vucomish 10(%ebp), %xmm3
 ; X86-NEXT:    setae %al
@@ -861,12 +856,11 @@ define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <
 ;
 ; X64-LABEL: test_v4f16_oge_q:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb $-3, %al
-; X64-NEXT:    kmovd %eax, %k0
 ; X64-NEXT:    vucomish %xmm3, %xmm2
 ; X64-NEXT:    setae %al
-; X64-NEXT:    kmovd %eax, %k1
-; X64-NEXT:    kandb %k0, %k1, %k0
+; X64-NEXT:    kmovd %eax, %k0
+; X64-NEXT:    kshiftlb $7, %k0, %k0
+; X64-NEXT:    kshiftrb $7, %k0, %k0
 ; X64-NEXT:    vpsrld $16, %xmm3, %xmm4
 ; X64-NEXT:    vpsrld $16, %xmm2, %xmm5
 ; X64-NEXT:    vucomish %xmm4, %xmm5
@@ -913,13 +907,12 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movb $-3, %al
-; X86-NEXT:    kmovd %eax, %k0
 ; X86-NEXT:    vmovsh 8(%ebp), %xmm3
 ; X86-NEXT:    vcomish %xmm2, %xmm3
 ; X86-NEXT:    seta %al
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    kandb %k0, %k1, %k0
+; X86-NEXT:    kmovd %eax, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    vpsrld $16, %xmm2, %xmm3
 ; X86-NEXT:    vmovsh 10(%ebp), %xmm4
 ; X86-NEXT:    vcomish %xmm3, %xmm4
@@ -957,12 +950,11 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <
 ;
 ; X64-LABEL: test_v4f16_olt_q:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb $-3, %al
-; X64-NEXT:    kmovd %eax, %k0
 ; X64-NEXT:    vcomish %xmm2, %xmm3
 ; X64-NEXT:    seta %al
-; X64-NEXT:    kmovd %eax, %k1
-; X64-NEXT:    kandb %k0, %k1, %k0
+; X64-NEXT:    kmovd %eax, %k0
+; X64-NEXT:    kshiftlb $7, %k0, %k0
+; X64-NEXT:    kshiftrb $7, %k0, %k0
 ; X64-NEXT:    vpsrld $16, %xmm2, %xmm4
 ; X64-NEXT:    vpsrld $16, %xmm3, %xmm5
 ; X64-NEXT:    vcomish %xmm4, %xmm5

diff  --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
index 1992c9dd2195..1e56ddc0c8ec 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
@@ -97,12 +97,10 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512-32-NEXT:    movl %esp, %ebp
 ; AVX512-32-NEXT:    andl $-16, %esp
 ; AVX512-32-NEXT:    subl $16, %esp
-; AVX512-32-NEXT:    movw $-3, %ax
-; AVX512-32-NEXT:    kmovw %eax, %k0
 ; AVX512-32-NEXT:    vcomiss 8(%ebp), %xmm2
 ; AVX512-32-NEXT:    seta %al
-; AVX512-32-NEXT:    kmovw %eax, %k1
-; AVX512-32-NEXT:    kandw %k0, %k1, %k0
+; AVX512-32-NEXT:    andl $1, %eax
+; AVX512-32-NEXT:    kmovw %eax, %k0
 ; AVX512-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512-32-NEXT:    vcomiss 12(%ebp), %xmm2
 ; AVX512-32-NEXT:    seta %al
@@ -117,12 +115,10 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ;
 ; AVX512-64-LABEL: test_v2f32_ogt_s:
 ; AVX512-64:       # %bb.0:
-; AVX512-64-NEXT:    movw $-3, %ax
-; AVX512-64-NEXT:    kmovw %eax, %k0
 ; AVX512-64-NEXT:    vcomiss %xmm3, %xmm2
 ; AVX512-64-NEXT:    seta %al
-; AVX512-64-NEXT:    kmovw %eax, %k1
-; AVX512-64-NEXT:    kandw %k0, %k1, %k0
+; AVX512-64-NEXT:    andl $1, %eax
+; AVX512-64-NEXT:    kmovw %eax, %k0
 ; AVX512-64-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512-64-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512-64-NEXT:    vcomiss %xmm3, %xmm2
@@ -142,12 +138,10 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512F-32-NEXT:    subl $16, %esp
 ; AVX512F-32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-32-NEXT:    movw $-3, %ax
-; AVX512F-32-NEXT:    kmovw %eax, %k0
 ; AVX512F-32-NEXT:    vcomiss 8(%ebp), %xmm2
 ; AVX512F-32-NEXT:    seta %al
-; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-32-NEXT:    andl $1, %eax
+; AVX512F-32-NEXT:    kmovw %eax, %k0
 ; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512F-32-NEXT:    vcomiss 12(%ebp), %xmm2
 ; AVX512F-32-NEXT:    seta %al
@@ -166,12 +160,10 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-64-NEXT:    movw $-3, %ax
-; AVX512F-64-NEXT:    kmovw %eax, %k0
 ; AVX512F-64-NEXT:    vcomiss %xmm3, %xmm2
 ; AVX512F-64-NEXT:    seta %al
-; AVX512F-64-NEXT:    kmovw %eax, %k1
-; AVX512F-64-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-64-NEXT:    andl $1, %eax
+; AVX512F-64-NEXT:    kmovw %eax, %k0
 ; AVX512F-64-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512F-64-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512F-64-NEXT:    vcomiss %xmm3, %xmm2
@@ -288,15 +280,13 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512-32-NEXT:    movl %esp, %ebp
 ; AVX512-32-NEXT:    andl $-16, %esp
 ; AVX512-32-NEXT:    subl $16, %esp
-; AVX512-32-NEXT:    movw $-3, %ax
-; AVX512-32-NEXT:    kmovw %eax, %k0
 ; AVX512-32-NEXT:    vucomiss 8(%ebp), %xmm2
 ; AVX512-32-NEXT:    setnp %al
 ; AVX512-32-NEXT:    sete %cl
 ; AVX512-32-NEXT:    testb %al, %cl
 ; AVX512-32-NEXT:    setne %al
-; AVX512-32-NEXT:    kmovw %eax, %k1
-; AVX512-32-NEXT:    kandw %k0, %k1, %k0
+; AVX512-32-NEXT:    andl $1, %eax
+; AVX512-32-NEXT:    kmovw %eax, %k0
 ; AVX512-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512-32-NEXT:    vucomiss 12(%ebp), %xmm2
 ; AVX512-32-NEXT:    setnp %al
@@ -319,10 +309,8 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512-64-NEXT:    sete %cl
 ; AVX512-64-NEXT:    testb %al, %cl
 ; AVX512-64-NEXT:    setne %al
+; AVX512-64-NEXT:    andl $1, %eax
 ; AVX512-64-NEXT:    kmovw %eax, %k0
-; AVX512-64-NEXT:    movw $-3, %ax
-; AVX512-64-NEXT:    kmovw %eax, %k1
-; AVX512-64-NEXT:    kandw %k1, %k0, %k0
 ; AVX512-64-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512-64-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512-64-NEXT:    vucomiss %xmm3, %xmm2
@@ -345,15 +333,13 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512F-32-NEXT:    subl $16, %esp
 ; AVX512F-32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-32-NEXT:    movw $-3, %ax
-; AVX512F-32-NEXT:    kmovw %eax, %k0
 ; AVX512F-32-NEXT:    vucomiss 8(%ebp), %xmm2
 ; AVX512F-32-NEXT:    setnp %al
 ; AVX512F-32-NEXT:    sete %cl
 ; AVX512F-32-NEXT:    testb %al, %cl
 ; AVX512F-32-NEXT:    setne %al
-; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-32-NEXT:    andl $1, %eax
+; AVX512F-32-NEXT:    kmovw %eax, %k0
 ; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512F-32-NEXT:    vucomiss 12(%ebp), %xmm2
 ; AVX512F-32-NEXT:    setnp %al
@@ -380,10 +366,8 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX512F-64-NEXT:    sete %cl
 ; AVX512F-64-NEXT:    testb %al, %cl
 ; AVX512F-64-NEXT:    setne %al
+; AVX512F-64-NEXT:    andl $1, %eax
 ; AVX512F-64-NEXT:    kmovw %eax, %k0
-; AVX512F-64-NEXT:    movw $-3, %ax
-; AVX512F-64-NEXT:    kmovw %eax, %k1
-; AVX512F-64-NEXT:    kandw %k1, %k0, %k0
 ; AVX512F-64-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512F-64-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX512F-64-NEXT:    vucomiss %xmm3, %xmm2

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 0dd634f7dc1e..f381213d06e6 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2926,10 +2926,8 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT:    vmovq %rcx, %xmm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512F-NEXT:    seto %al
-; AVX512F-NEXT:    movw $-3, %cx
-; AVX512F-NEXT:    kmovw %ecx, %k0
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-NEXT:    andl $1, %eax
+; AVX512F-NEXT:    kmovw %eax, %k0
 ; AVX512F-NEXT:    kmovw %edx, %k1
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
@@ -2952,10 +2950,8 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vmovq %rcx, %xmm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512BW-NEXT:    seto %al
-; AVX512BW-NEXT:    movw $-3, %cx
-; AVX512BW-NEXT:    kmovd %ecx, %k0
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    kmovw %eax, %k0
 ; AVX512BW-NEXT:    kmovd %edx, %k1
 ; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1

diff  --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 72fbd8a2c99f..dffb86adc39a 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2604,10 +2604,8 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT:    vmovq %rax, %xmm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512F-NEXT:    seto %al
-; AVX512F-NEXT:    movw $-3, %cx
-; AVX512F-NEXT:    kmovw %ecx, %k0
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    kandw %k0, %k1, %k0
+; AVX512F-NEXT:    andl $1, %eax
+; AVX512F-NEXT:    kmovw %eax, %k0
 ; AVX512F-NEXT:    kmovw %r8d, %k1
 ; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
@@ -2631,10 +2629,8 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vmovq %rax, %xmm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512BW-NEXT:    seto %al
-; AVX512BW-NEXT:    movw $-3, %cx
-; AVX512BW-NEXT:    kmovd %ecx, %k0
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    kmovw %eax, %k0
 ; AVX512BW-NEXT:    kmovd %r8d, %k1
 ; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1


        


More information about the llvm-commits mailing list