[llvm] a4124e4 - [X86] When storing v1i1/v2i1/v4i1 to memory, make sure we store zeros in the rest of the byte

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 12 21:28:54 PST 2020


Author: Craig Topper
Date: 2020-11-12T21:28:18-08:00
New Revision: a4124e455e641db1e18d4221d2dacb31953fd13b

URL: https://github.com/llvm/llvm-project/commit/a4124e455e641db1e18d4221d2dacb31953fd13b
DIFF: https://github.com/llvm/llvm-project/commit/a4124e455e641db1e18d4221d2dacb31953fd13b.diff

LOG: [X86] When storing v1i1/v2i1/v4i1 to memory, make sure we store zeros in the rest of the byte

We can't store garbage in the unused bits. It's possible that something like a zextload from i1/i2/i4 is created to read the memory. Those zextloads would be legalized assuming the extra bits are 0.
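
To make the hazard concrete, here is a minimal IR sketch of a pattern that could observe the padding bits (hand-written for illustration; the function name is invented, and the byte-wide load stands in for the legalized zextload):

    define i32 @observe_padding(<4 x i1> %m, <4 x i1>* %p) {
      store <4 x i1> %m, <4 x i1>* %p   ; must leave bits 4..7 of the byte zeroed
      %raw = bitcast <4 x i1>* %p to i8*
      %b = load i8, i8* %raw            ; reads the whole byte back
      %z = zext i8 %b to i32            ; only correct if the padding bits were 0
      ret i32 %z
    }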

I'm not sure that the code in LowerStore is executed for the v1i1/v2i1/v4i1 case. It looks like the DAG combine in combineStore may have converted them to v8i1 first. And I think we're missing some cases to avoid going to the stack in the first place. But I don't have time to investigate those things at the moment, so I wanted to focus on the correctness issue.

Should fix PR48147.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D91294

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrAVX512.td
    llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
    llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/avx512-select.ll
    llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
    llvm/test/CodeGen/X86/vec_saddo.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_ssubo.ll
    llvm/test/CodeGen/X86/vec_uaddo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll
    llvm/test/CodeGen/X86/vec_usubo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b0e9fa75351..f1956d77d615 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23870,17 +23870,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
   if (StoredVal.getValueType().isVector() &&
       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
-    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
-           "Unexpected VT");
+    unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+    assert(NumElts <= 8 && "Unexpected VT");
     assert(!St->isTruncatingStore() && "Expected non-truncating store");
     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
            "Expected AVX512F without AVX512DQI");
 
+    // We must pad with zeros to ensure we store zeroes to any unused bits.
     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                             DAG.getUNDEF(MVT::v16i1), StoredVal,
                             DAG.getIntPtrConstant(0, dl));
     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
+    // Make sure we store zeros in the extra bits.
+    if (NumElts < 8)
+      StoredVal = DAG.getZeroExtendInReg(StoredVal, dl,
+                                         MVT::getIntegerVT(NumElts));
 
     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                         St->getPointerInfo(), St->getOriginalAlign(),
@@ -44971,17 +44976,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
       StoredVal.getOperand(0).getValueType() == MVT::i8) {
-    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+    SDValue Val = StoredVal.getOperand(0);
+    // We must store zeros to the unused bits.
+    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+    return DAG.getStore(St->getChain(), dl, Val,
                         St->getBasePtr(), St->getPointerInfo(),
                         St->getOriginalAlign(),
                         St->getMemOperand()->getFlags());
   }
 
   // Widen v2i1/v4i1 stores to v8i1.
-  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+  if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
       Subtarget.hasAVX512()) {
     unsigned NumConcats = 8 / VT.getVectorNumElements();
-    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+    // We must store zeros to the unused bits.
+    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
     Ops[0] = StoredVal;
     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),

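For illustration, the net effect of the two changes on a <2 x i1> store can be written as straight-line IR (a hand-written sketch, not compiler output; the function name is invented):

    define void @store_v2i1_sketch(i8 %kbits, i8* %p) {
      ; getZeroExtendInReg(StoredVal, dl, MVT::i2) amounts to masking with 0b11,
      ; so only the two live mask bits reach memory.
      %masked = and i8 %kbits, 3
      store i8 %masked, i8* %p
      ret void
    }
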
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 3d8fbdc3b82b..339fd0016435 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2921,9 +2921,6 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
 
 // Load/store kreg
 let Predicates = [HasDQI] in {
-  def : Pat<(store VK1:$src, addr:$dst),
-            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
-
   def : Pat<(v1i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
   def : Pat<(v2i1 (load addr:$src)),

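The deleted DQI pattern selected a bare KMOVBmk for a v1i1 store, and KMOVB writes all eight bits of the source k register to memory, so bits 1 through 7 could carry stale data. With the pattern gone, a store like the following (a trivial illustrative case, not a test from the commit) goes through the lowering above, which zeros the padding first:

    define void @store_v1i1_sketch(<1 x i1> %v, <1 x i1>* %p) {
      store <1 x i1> %v, <1 x i1>* %p   ; bits 1..7 of the stored byte must be 0
      ret void
    }
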
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 7b7ddf72123b..deed569c3165 100644
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -593,6 +593,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovb (%rdi), %k0
 ; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -600,6 +602,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -619,6 +623,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-NEXT:    cmovel %ecx, %eax
 ; AVX512-NEXT:    kmovd %eax, %k0
 ; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -632,6 +638,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT:    cmovel %ecx, %eax
 ; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -649,6 +657,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-NEXT:    cmovel %eax, %ecx
 ; AVX512-NEXT:    kmovd %ecx, %k0
 ; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -660,6 +670,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT:    cmovel %eax, %ecx
 ; AVX512NOTDQ-NEXT:    kmovd %ecx, %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -673,6 +685,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovb (%rdi), %k0
 ; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -680,6 +694,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -693,6 +709,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovb (%rdi), %k0
 ; AVX512-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -700,6 +718,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -713,6 +733,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovb (%rdi), %k0
 ; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -720,6 +742,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -760,6 +784,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovb (%rdi), %k0
 ; AVX512-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -767,6 +793,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -807,6 +835,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovw (%rdi), %k0
 ; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -814,6 +844,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -881,6 +913,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovw (%rdi), %k0
 ; AVX512-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -888,6 +922,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -955,6 +991,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovd (%rdi), %k0
 ; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -962,6 +1000,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -1056,6 +1096,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovd (%rdi), %k0
 ; AVX512-NEXT:    kshiftrd $31, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -1063,6 +1105,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrd $31, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -1160,6 +1204,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovq (%rdi), %k0
 ; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -1167,6 +1213,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
@@ -1286,6 +1334,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovq (%rdi), %k0
 ; AVX512-NEXT:    kshiftrq $63, %k0, %k0
+; AVX512-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
@@ -1293,6 +1343,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
 ; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq

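The recurring kshiftl/kshiftr pairs in the updated checks are the zero extension in disguise: shifting the mask to the top of the register and back down clears every bit above the stored element count. The identity they rely on, written as IR on an 8-bit mask with one live bit (a sketch for illustration):

    define i8 @keep_low_bit(i8 %k) {
      %shl = shl i8 %k, 7
      %clr = lshr i8 %shl, 7   ; (k << 7) >> 7 == k & 1 for an unsigned byte
      ret i8 %clr
    }
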
diff --git a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
index c3bcebe00e39..95e216632f6a 100644
--- a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
@@ -5,13 +5,18 @@
 define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i2_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i2_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i2>, <1 x i2>* %a0
@@ -22,13 +27,18 @@ define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) {
 define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i3_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i3_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i3>, <1 x i3>* %a0
@@ -39,13 +49,18 @@ define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) {
 define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i4_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i4_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i4>, <1 x i4>* %a0
@@ -56,13 +71,18 @@ define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) {
 define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i8_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i8_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i8>, <1 x i8>* %a0
@@ -73,13 +93,18 @@ define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) {
 define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i16_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i16_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i16>, <1 x i16>* %a0
@@ -90,13 +115,18 @@ define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) {
 define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i32_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i32_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i32>, <1 x i32>* %a0
@@ -107,13 +137,18 @@ define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) {
 define void @load_v1i64_trunc_v1i1_store(<1 x i64>* %a0,<1 x i1>* %a1) {
 ; AVX512-ALL-LABEL: load_v1i64_trunc_v1i1_store:
 ; AVX512-ALL:       # %bb.0:
-; AVX512-ALL-NEXT:    movb (%rdi), %al
-; AVX512-ALL-NEXT:    movb %al, (%rsi)
+; AVX512-ALL-NEXT:    kmovb (%rdi), %k0
+; AVX512-ALL-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-ALL-NEXT:    retq
 ;
 ; AVX512-ONLY-LABEL: load_v1i64_trunc_v1i1_store:
 ; AVX512-ONLY:       # %bb.0:
 ; AVX512-ONLY-NEXT:    movb (%rdi), %al
+; AVX512-ONLY-NEXT:    andl $1, %eax
+; AVX512-ONLY-NEXT:    kmovw %eax, %k0
+; AVX512-ONLY-NEXT:    kmovw %k0, %eax
 ; AVX512-ONLY-NEXT:    movb %al, (%rsi)
 ; AVX512-ONLY-NEXT:    retq
     %d0 = load <1 x i64>, <1 x i64>* %a0

diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 8fa7fcc21e62..5df6842994f0 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1455,6 +1455,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1471,6 +1473,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1480,6 +1484,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $4, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1500,6 +1506,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1516,6 +1524,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1525,6 +1535,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1545,6 +1557,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rsi)
 ; KNL-NEXT:    retq
@@ -1553,6 +1567,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovd %edi, %k0
 ; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rsi)
 ; SKX-NEXT:    retq
 ;
@@ -1560,6 +1576,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovd %edi, %k0
 ; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rsi)
 ; AVX512BW-NEXT:    retq
@@ -1568,6 +1586,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    kmovw %edi, %k0
 ; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rsi)
 ; AVX512DQ-NEXT:    retq
 ;
@@ -1576,6 +1596,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <1 x i1> %c, <i1 1>
@@ -1588,6 +1610,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $14, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1598,6 +1622,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovq2m %xmm0, %k0
 ; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kshiftlb $6, %k0, %k0
+; SKX-NEXT:    kshiftrb $6, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
 ;
@@ -1605,6 +1631,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1615,6 +1643,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; AVX512DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1625,6 +1655,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; X86-NEXT:    vpmovq2m %xmm0, %k0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
+; X86-NEXT:    kshiftlb $6, %k0, %k0
+; X86-NEXT:    kshiftrb $6, %k0, %k0
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <2 x i1> %c, <i1 1, i1 1>
@@ -1637,6 +1669,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1647,6 +1681,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
 ; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kshiftlb $4, %k0, %k0
+; SKX-NEXT:    kshiftrb $4, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
 ;
@@ -1654,6 +1690,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1664,6 +1702,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; AVX512DQ-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $4, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1674,6 +1714,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; X86-NEXT:    vpmovd2m %xmm0, %k0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
+; X86-NEXT:    kshiftlb $4, %k0, %k0
+; X86-NEXT:    kshiftrb $4, %k0, %k0
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
@@ -5206,6 +5248,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5220,6 +5264,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SKX-NEXT:    retq
@@ -5229,6 +5275,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; AVX512BW-NEXT:    kmovd %edi, %k0
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5243,6 +5291,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512DQ-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    retq
@@ -5260,6 +5310,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    kxorw %k1, %k0, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    kmovb %k0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    popl %ecx
@@ -5277,6 +5329,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5291,6 +5345,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SKX-NEXT:    retq
@@ -5300,6 +5356,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; AVX512BW-NEXT:    kmovd %edi, %k0
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5314,6 +5372,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512DQ-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    retq
@@ -5331,6 +5391,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    kxorw %k1, %k0, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    kmovb %k0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    popl %ecx
@@ -5348,6 +5410,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5362,6 +5426,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT:    kandw %k1, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SKX-NEXT:    retq
@@ -5371,6 +5437,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; AVX512BW-NEXT:    kmovd %edi, %k0
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT:    movb -{{[0-9]+}}(%rsp), %al
@@ -5385,6 +5453,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k0
 ; AVX512DQ-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512DQ-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT:    retq
@@ -5402,6 +5472,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    kandw %k1, %k0, %k0
+; X86-NEXT:    kshiftlb $7, %k0, %k0
+; X86-NEXT:    kshiftrb $7, %k0, %k0
 ; X86-NEXT:    kmovb %k0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    popl %ecx

diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index a60f6ee06e73..8eadc360d939 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -552,6 +552,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X86-AVX512F-NEXT:    kandnw %k1, %k2, %k1
 ; X86-AVX512F-NEXT:    kandw %k2, %k0, %k0
 ; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
+; X86-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512F-NEXT:    kmovw %k0, %eax
 ; X86-AVX512F-NEXT:    movb %al, (%edx)
 ; X86-AVX512F-NEXT:    popl %esi
@@ -568,6 +570,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X64-AVX512F-NEXT:    kandnw %k1, %k2, %k1
 ; X64-AVX512F-NEXT:    kandw %k2, %k0, %k0
 ; X64-AVX512F-NEXT:    korw %k1, %k0, %k0
+; X64-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
+; X64-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; X64-AVX512F-NEXT:    kmovw %k0, %eax
 ; X64-AVX512F-NEXT:    movb %al, (%rsi)
 ; X64-AVX512F-NEXT:    retq
@@ -587,6 +591,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X86-AVX512BW-NEXT:    kandnw %k1, %k2, %k1
 ; X86-AVX512BW-NEXT:    kandw %k2, %k0, %k0
 ; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; X86-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512BW-NEXT:    kmovd %k0, %eax
 ; X86-AVX512BW-NEXT:    movb %al, (%edx)
 ; X86-AVX512BW-NEXT:    popl %esi
@@ -603,6 +609,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X64-AVX512BW-NEXT:    kandnw %k1, %k2, %k1
 ; X64-AVX512BW-NEXT:    kandw %k2, %k0, %k0
 ; X64-AVX512BW-NEXT:    korw %k1, %k0, %k0
+; X64-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; X64-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; X64-AVX512BW-NEXT:    kmovd %k0, %eax
 ; X64-AVX512BW-NEXT:    movb %al, (%rsi)
 ; X64-AVX512BW-NEXT:    retq
@@ -634,6 +642,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
 ; X86-AVX512F-NEXT:    movzbl (%eax), %ecx
 ; X86-AVX512F-NEXT:    kmovw %ecx, %k0
 ; X86-AVX512F-NEXT:  .LBB18_3:
+; X86-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
+; X86-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512F-NEXT:    kmovw %k0, %ecx
 ; X86-AVX512F-NEXT:    movb %cl, (%eax)
 ; X86-AVX512F-NEXT:    retl
@@ -653,6 +663,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
 ; X64-AVX512F-NEXT:    movzbl (%rsi), %eax
 ; X64-AVX512F-NEXT:    kmovw %eax, %k0
 ; X64-AVX512F-NEXT:  .LBB18_3:
+; X64-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
+; X64-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; X64-AVX512F-NEXT:    kmovw %k0, %eax
 ; X64-AVX512F-NEXT:    movb %al, (%rsi)
 ; X64-AVX512F-NEXT:    retq
@@ -675,6 +687,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
 ; X86-AVX512BW-NEXT:    movzbl (%eax), %ecx
 ; X86-AVX512BW-NEXT:    kmovd %ecx, %k0
 ; X86-AVX512BW-NEXT:  .LBB18_3:
+; X86-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; X86-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512BW-NEXT:    kmovd %k0, %ecx
 ; X86-AVX512BW-NEXT:    movb %cl, (%eax)
 ; X86-AVX512BW-NEXT:    retl
@@ -694,6 +708,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
 ; X64-AVX512BW-NEXT:    movzbl (%rsi), %eax
 ; X64-AVX512BW-NEXT:    kmovd %eax, %k0
 ; X64-AVX512BW-NEXT:  .LBB18_3:
+; X64-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; X64-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; X64-AVX512BW-NEXT:    kmovd %k0, %eax
 ; X64-AVX512BW-NEXT:    movb %al, (%rsi)
 ; X64-AVX512BW-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
index 839dd1c70351..f921d10f071d 100644
--- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
@@ -89,9 +89,13 @@ define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>*
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    movb %cl, (%rsi)
+; CHECK-NEXT:    kshiftlw $12, %k0, %k2
+; CHECK-NEXT:    kshiftrw $12, %k2, %k2
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    movb %al, (%rsi)
+; CHECK-NEXT:    kshiftlw $12, %k1, %k0
+; CHECK-NEXT:    kshiftrw $12, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    movb %al, (%rdx)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -115,9 +119,13 @@ define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>*
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0
 ; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    movb %cl, (%rsi)
+; CHECK-NEXT:    kshiftlw $12, %k0, %k2
+; CHECK-NEXT:    kshiftrw $12, %k2, %k2
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    movb %al, (%rsi)
+; CHECK-NEXT:    kshiftlw $12, %k1, %k0
+; CHECK-NEXT:    kshiftrw $12, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    movb %al, (%rdx)
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
@@ -140,9 +148,13 @@ define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>*
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0
 ; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    movb %cl, (%rsi)
+; CHECK-NEXT:    kshiftlw $14, %k0, %k2
+; CHECK-NEXT:    kshiftrw $14, %k2, %k2
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    movb %al, (%rsi)
+; CHECK-NEXT:    kshiftlw $14, %k1, %k0
+; CHECK-NEXT:    kshiftrw $14, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    movb %al, (%rdx)
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()

diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 8197a3521f09..8a2ed6c8c8a8 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1131,6 +1131,8 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kxorw %k2, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 1b5aef61ebf3..746f9e5e642e 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3559,64 +3559,65 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ;
 ; AVX512-LABEL: smulo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
 ; AVX512-NEXT:    pushq %rbx
 ; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
 ; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kmovd %k1, %r9d
-; AVX512-NEXT:    andb $1, %r9b
-; AVX512-NEXT:    negb %r9b
+; AVX512-NEXT:    kmovd %k1, %r10d
+; AVX512-NEXT:    andb $1, %r10b
+; AVX512-NEXT:    negb %r10b
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %r10d
-; AVX512-NEXT:    andb $1, %r10b
-; AVX512-NEXT:    negb %r10b
+; AVX512-NEXT:    kmovd %k2, %r9d
+; AVX512-NEXT:    andb $1, %r9b
+; AVX512-NEXT:    negb %r9b
 ; AVX512-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %r11d
-; AVX512-NEXT:    andb $1, %r11b
-; AVX512-NEXT:    negb %r11b
+; AVX512-NEXT:    kmovd %k2, %ebp
+; AVX512-NEXT:    andb $1, %bpl
+; AVX512-NEXT:    negb %bpl
 ; AVX512-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %ebx
-; AVX512-NEXT:    andb $1, %bl
-; AVX512-NEXT:    negb %bl
+; AVX512-NEXT:    kmovd %k2, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    negb %dl
 ; AVX512-NEXT:    kshiftrw $1, %k0, %k2
 ; AVX512-NEXT:    kmovd %k2, %esi
 ; AVX512-NEXT:    andb $1, %sil
 ; AVX512-NEXT:    negb %sil
 ; AVX512-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %edx
-; AVX512-NEXT:    andb $1, %dl
-; AVX512-NEXT:    negb %dl
+; AVX512-NEXT:    kmovd %k2, %ecx
+; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    negb %cl
 ; AVX512-NEXT:    kmovd %k1, %eax
 ; AVX512-NEXT:    andb $1, %al
 ; AVX512-NEXT:    negb %al
-; AVX512-NEXT:    kmovd %k0, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
+; AVX512-NEXT:    kmovd %k0, %ebx
+; AVX512-NEXT:    andb $1, %bl
+; AVX512-NEXT:    negb %bl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    imulb %cl
-; AVX512-NEXT:    movl %eax, %r8d
+; AVX512-NEXT:    imulb %bl
+; AVX512-NEXT:    movl %eax, %r11d
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    movl %r8d, %ecx
-; AVX512-NEXT:    andb $1, %cl
-; AVX512-NEXT:    negb %cl
-; AVX512-NEXT:    cmpb %r8b, %cl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    movl %r11d, %ebx
+; AVX512-NEXT:    andb $1, %bl
+; AVX512-NEXT:    negb %bl
+; AVX512-NEXT:    cmpb %r11b, %bl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %al, %bl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    movw $-3, %ax
 ; AVX512-NEXT:    kmovd %eax, %k0
 ; AVX512-NEXT:    kandw %k0, %k1, %k1
-; AVX512-NEXT:    movl %edx, %eax
+; AVX512-NEXT:    movl %ecx, %eax
 ; AVX512-NEXT:    imulb %sil
-; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    movl %eax, %r8d
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    movl %edx, %ecx
+; AVX512-NEXT:    movl %r8d, %ecx
 ; AVX512-NEXT:    andb $1, %cl
 ; AVX512-NEXT:    negb %cl
-; AVX512-NEXT:    cmpb %dl, %cl
+; AVX512-NEXT:    cmpb %r8b, %cl
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    orb %al, %cl
 ; AVX512-NEXT:    setne %al
@@ -3627,8 +3628,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    movw $-5, %ax
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kandw %k1, %k2, %k2
-; AVX512-NEXT:    movl %r11d, %eax
-; AVX512-NEXT:    imulb %bl
+; AVX512-NEXT:    movl %ebp, %eax
+; AVX512-NEXT:    imulb %dl
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    movl %esi, %ecx
@@ -3643,25 +3644,26 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    kshiftlw $13, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512-NEXT:    movl %r10d, %eax
-; AVX512-NEXT:    imulb %r9b
+; AVX512-NEXT:    movl %r9d, %eax
+; AVX512-NEXT:    imulb %r10b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
 ; AVX512-NEXT:    seto %cl
-; AVX512-NEXT:    movl %eax, %ebx
-; AVX512-NEXT:    andb $1, %bl
-; AVX512-NEXT:    negb %bl
-; AVX512-NEXT:    cmpb %al, %bl
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    orb %cl, %bl
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    negb %dl
+; AVX512-NEXT:    cmpb %al, %dl
+; AVX512-NEXT:    setne %dl
+; AVX512-NEXT:    orb %cl, %dl
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    kmovd %ecx, %k3
 ; AVX512-NEXT:    kshiftlw $3, %k3, %k3
 ; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kmovd %r8d, %k2
+; AVX512-NEXT:    andl $1, %r11d
+; AVX512-NEXT:    kmovw %r11d, %k2
 ; AVX512-NEXT:    kandw %k0, %k2, %k0
-; AVX512-NEXT:    kmovd %edx, %k2
+; AVX512-NEXT:    kmovd %r8d, %k2
 ; AVX512-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $14, %k2, %k2
 ; AVX512-NEXT:    korw %k2, %k0, %k0
@@ -3680,6 +3682,7 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0

diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d4d458560fa6..f9dbaccfbd72 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1140,6 +1140,8 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kxorw %k2, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index c34653be4a02..5954f1195741 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1180,7 +1180,9 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kandnw %k0, %k1, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kmovd %k1, %eax
+; AVX512-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)

diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index cc25fd5bec78..4766fe90a3d4 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -3188,52 +3188,53 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ;
 ; AVX512-LABEL: umulo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
 ; AVX512-NEXT:    pushq %rbx
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512-NEXT:    kmovd %k1, %r9d
-; AVX512-NEXT:    andb $1, %r9b
+; AVX512-NEXT:    kmovd %k1, %r8d
+; AVX512-NEXT:    andb $1, %r8b
 ; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512-NEXT:    kmovd %k2, %r9d
+; AVX512-NEXT:    andb $1, %r9b
+; AVX512-NEXT:    kshiftrw $2, %k0, %k2
 ; AVX512-NEXT:    kmovd %k2, %r10d
 ; AVX512-NEXT:    andb $1, %r10b
-; AVX512-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512-NEXT:    kshiftrw $2, %k1, %k2
 ; AVX512-NEXT:    kmovd %k2, %r11d
 ; AVX512-NEXT:    andb $1, %r11b
-; AVX512-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512-NEXT:    kmovd %k2, %ebx
-; AVX512-NEXT:    andb $1, %bl
 ; AVX512-NEXT:    kshiftrw $1, %k0, %k2
-; AVX512-NEXT:    kmovd %k2, %edx
-; AVX512-NEXT:    andb $1, %dl
+; AVX512-NEXT:    kmovd %k2, %ecx
+; AVX512-NEXT:    andb $1, %cl
 ; AVX512-NEXT:    kshiftrw $1, %k1, %k2
 ; AVX512-NEXT:    kmovd %k2, %esi
 ; AVX512-NEXT:    andb $1, %sil
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    kmovd %k1, %ecx
-; AVX512-NEXT:    andb $1, %cl
+; AVX512-NEXT:    kmovd %k1, %edx
+; AVX512-NEXT:    andb $1, %dl
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    mulb %cl
-; AVX512-NEXT:    movl %eax, %r8d
+; AVX512-NEXT:    mulb %dl
+; AVX512-NEXT:    movl %eax, %edx
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    testb $-2, %r8b
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    testb $-2, %dl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %al, %bl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    movw $-3, %ax
 ; AVX512-NEXT:    kmovd %eax, %k0
 ; AVX512-NEXT:    kandw %k0, %k1, %k1
-; AVX512-NEXT:    movl %edx, %eax
+; AVX512-NEXT:    movl %ecx, %eax
 ; AVX512-NEXT:    mulb %sil
-; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    movl %eax, %ebp
 ; AVX512-NEXT:    seto %al
-; AVX512-NEXT:    testb $-2, %dl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    testb $-2, %bpl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %al, %bl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k2
 ; AVX512-NEXT:    kshiftlw $15, %k2, %k2
@@ -3242,35 +3243,36 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    movw $-5, %ax
 ; AVX512-NEXT:    kmovd %eax, %k1
 ; AVX512-NEXT:    kandw %k1, %k2, %k2
-; AVX512-NEXT:    movl %r11d, %eax
-; AVX512-NEXT:    mulb %bl
+; AVX512-NEXT:    movl %r10d, %eax
+; AVX512-NEXT:    mulb %r11b
 ; AVX512-NEXT:    movl %eax, %esi
 ; AVX512-NEXT:    seto %al
 ; AVX512-NEXT:    testb $-2, %sil
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    orb %al, %cl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    orb %al, %bl
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    kmovd %eax, %k3
 ; AVX512-NEXT:    kshiftlw $2, %k3, %k3
 ; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    kshiftlw $13, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $13, %k2, %k2
-; AVX512-NEXT:    movl %r9d, %eax
-; AVX512-NEXT:    mulb %r10b
+; AVX512-NEXT:    movl %r8d, %eax
+; AVX512-NEXT:    mulb %r9b
 ; AVX512-NEXT:    # kill: def $al killed $al def $eax
-; AVX512-NEXT:    seto %cl
+; AVX512-NEXT:    seto %bl
 ; AVX512-NEXT:    testb $-2, %al
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    orb %cl, %bl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    orb %bl, %cl
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    kmovd %ecx, %k3
 ; AVX512-NEXT:    kshiftlw $3, %k3, %k3
 ; AVX512-NEXT:    korw %k3, %k2, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kmovd %r8d, %k2
+; AVX512-NEXT:    andl $1, %edx
+; AVX512-NEXT:    kmovw %edx, %k2
 ; AVX512-NEXT:    kandw %k0, %k2, %k0
-; AVX512-NEXT:    kmovd %edx, %k2
+; AVX512-NEXT:    kmovd %ebp, %k2
 ; AVX512-NEXT:    kshiftlw $15, %k2, %k2
 ; AVX512-NEXT:    kshiftrw $14, %k2, %k2
 ; AVX512-NEXT:    korw %k2, %k0, %k0
@@ -3289,6 +3291,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
   %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0

diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 76c3e5ad3290..afb0f6cce29c 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1227,7 +1227,9 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kmovd %k1, %eax
+; AVX512-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)

