[llvm] a4124e4 - [X86] When storing v1i1/v2i1/v4i1 to memory, make sure we store zeros in the rest of the byte
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 12 21:28:54 PST 2020
Author: Craig Topper
Date: 2020-11-12T21:28:18-08:00
New Revision: a4124e455e641db1e18d4221d2dacb31953fd13b
URL: https://github.com/llvm/llvm-project/commit/a4124e455e641db1e18d4221d2dacb31953fd13b
DIFF: https://github.com/llvm/llvm-project/commit/a4124e455e641db1e18d4221d2dacb31953fd13b.diff
LOG: [X86] When storing v1i1/v2i1/v4i1 to memory, make sure we store zeros in the rest of the byte
We can't store garbage in the unused bits. It's possible that something like a zextload from i1/i2/i4 is created to read the memory. Those zextloads would be legalized assuming the extra bits are 0.
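To make the failure mode concrete, here is a minimal standalone C++ sketch of the hazard (illustrative values only, not LLVM code): a byte store that leaves garbage in the unused bits breaks a later read that assumes they are zero.

#include <cassert>
#include <cstdint>

int main() {
  // A v1i1 value lives in bit 0 of a byte; bits 1-7 are "unused".
  // Buggy store: the mask register happens to hold garbage above bit 0.
  uint8_t StoredByte = 0x01 | 0xFE; // i1 value 1, garbage in bits 1-7

  // A zextload from i1 is legalized to a plain byte load on the assumption
  // that the unused bits are zero, so no masking is emitted after it.
  uint8_t ZExtLoaded = StoredByte;
  assert(ZExtLoaded == 0xFF); // the reader expected 1; garbage leaked through

  // Fixed store: zero the unused bits before writing the byte.
  StoredByte = (0x01 | 0xFE) & 0x01;
  ZExtLoaded = StoredByte;
  assert(ZExtLoaded == 1); // now matches the zextload's assumption
  return 0;
}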
I'm not sure the code in LowerStore is ever executed for the v1i1/v2i1/v4i1 case. It looks like the DAG combine in combineStore may have converted them to v8i1 first. I also think we're missing some cases that would avoid going to the stack in the first place. But I don't have time to investigate those things at the moment, so I wanted to focus on the correctness issue.
Should fix PR48147.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D91294
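For readers of the test diffs below: the zeroing shows up as kshiftl/kshiftr pairs on the mask register, which is how the DAG.getZeroExtendInReg node lowers here. A minimal C++ model of that shift idiom (the helper name is mine, not LLVM's):

#include <cassert>
#include <cstdint>

// Shifting a 16-bit mask left then right by (16 - NumElts) clears every bit
// above the low NumElts bits -- the effect of the kshiftlw $(16-N) /
// kshiftrw $(16-N) pairs in the updated CHECK lines.
uint16_t zeroExtendInReg(uint16_t Mask, unsigned NumElts) {
  unsigned Shift = 16 - NumElts;
  return static_cast<uint16_t>(static_cast<uint16_t>(Mask << Shift) >> Shift);
}

int main() {
  assert(zeroExtendInReg(0xFFFA, 4) == 0x000A); // v4i1: keep bits 0-3
  assert(zeroExtendInReg(0xFFFE, 2) == 0x0002); // v2i1: keep bits 0-1
  assert(zeroExtendInReg(0x00FF, 1) == 0x0001); // v1i1: keep bit 0
  return 0;
}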
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
llvm/test/CodeGen/X86/avx512-mask-op.ll
llvm/test/CodeGen/X86/avx512-select.ll
llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b0e9fa75351..f1956d77d615 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23870,17 +23870,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
- assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
- "Unexpected VT");
+ unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+ assert(NumElts <= 8 && "Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
+ // We must pad with zeros to ensure we store zeros in any unused bits.
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
+ // Make sure we store zeros in the extra bits.
+ if (NumElts < 8)
+ StoredVal = DAG.getZeroExtendInReg(StoredVal, dl,
+ MVT::getIntegerVT(NumElts));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
@@ -44971,17 +44976,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
- return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+ SDValue Val = StoredVal.getOperand(0);
+ // We must store zeros to the unused bits.
+ Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+ return DAG.getStore(St->getChain(), dl, Val,
St->getBasePtr(), St->getPointerInfo(),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
- if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+ // We must store zeros to the unused bits.
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 3d8fbdc3b82b..339fd0016435 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2921,9 +2921,6 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
// Load/store kreg
let Predicates = [HasDQI] in {
- def : Pat<(store VK1:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
-
def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
def : Pat<(v2i1 (load addr:$src)),
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 7b7ddf72123b..deed569c3165 100644
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -593,6 +593,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -600,6 +602,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -619,6 +623,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-NEXT: cmovel %ecx, %eax
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -632,6 +638,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ-NEXT: cmovel %ecx, %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -649,6 +657,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-NEXT: cmovel %eax, %ecx
; AVX512-NEXT: kmovd %ecx, %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -660,6 +670,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ-NEXT: cmovel %eax, %ecx
; AVX512NOTDQ-NEXT: kmovd %ecx, %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -673,6 +685,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -680,6 +694,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -693,6 +709,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $3, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -700,6 +718,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -713,6 +733,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $4, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -720,6 +742,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -760,6 +784,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -767,6 +793,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -807,6 +835,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -814,6 +844,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -881,6 +913,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -888,6 +922,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -955,6 +991,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -962,6 +1000,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -1056,6 +1096,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $31, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -1063,6 +1105,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -1160,6 +1204,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -1167,6 +1213,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
@@ -1286,6 +1334,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $63, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -1293,6 +1343,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
index c3bcebe00e39..95e216632f6a 100644
--- a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
@@ -5,13 +5,18 @@
define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i2_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i2_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i2>, <1 x i2>* %a0
@@ -22,13 +27,18 @@ define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) {
define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i3_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i3_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i3>, <1 x i3>* %a0
@@ -39,13 +49,18 @@ define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) {
define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i4_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i4_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i4>, <1 x i4>* %a0
@@ -56,13 +71,18 @@ define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) {
define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i8_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i8_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i8>, <1 x i8>* %a0
@@ -73,13 +93,18 @@ define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) {
define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i16_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i16_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i16>, <1 x i16>* %a0
@@ -90,13 +115,18 @@ define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) {
define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i32_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i32_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i32>, <1 x i32>* %a0
@@ -107,13 +137,18 @@ define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) {
define void @load_v1i64_trunc_v1i1_store(<1 x i64>* %a0,<1 x i1>* %a1) {
; AVX512-ALL-LABEL: load_v1i64_trunc_v1i1_store:
; AVX512-ALL: # %bb.0:
-; AVX512-ALL-NEXT: movb (%rdi), %al
-; AVX512-ALL-NEXT: movb %al, (%rsi)
+; AVX512-ALL-NEXT: kmovb (%rdi), %k0
+; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
; AVX512-ALL-NEXT: retq
;
; AVX512-ONLY-LABEL: load_v1i64_trunc_v1i1_store:
; AVX512-ONLY: # %bb.0:
; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: andl $1, %eax
+; AVX512-ONLY-NEXT: kmovw %eax, %k0
+; AVX512-ONLY-NEXT: kmovw %k0, %eax
; AVX512-ONLY-NEXT: movb %al, (%rsi)
; AVX512-ONLY-NEXT: retq
%d0 = load <1 x i64>, <1 x i64>* %a0
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 8fa7fcc21e62..5df6842994f0 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1455,6 +1455,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; KNL: ## %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
@@ -1471,6 +1473,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -1480,6 +1484,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1500,6 +1506,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; KNL: ## %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
@@ -1516,6 +1524,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -1525,6 +1535,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1545,6 +1557,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rsi)
; KNL-NEXT: retq
@@ -1553,6 +1567,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rsi)
; SKX-NEXT: retq
;
@@ -1560,6 +1576,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rsi)
; AVX512BW-NEXT: retq
@@ -1568,6 +1586,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rsi)
; AVX512DQ-NEXT: retq
;
@@ -1576,6 +1596,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <1 x i1> %c, <i1 1>
@@ -1588,6 +1610,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; KNL: ## %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
@@ -1598,6 +1622,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vpmovq2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
@@ -1605,6 +1631,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -1615,6 +1643,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1625,6 +1655,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; X86-NEXT: vpmovq2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $6, %k0, %k0
+; X86-NEXT: kshiftrb $6, %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <2 x i1> %c, <i1 1, i1 1>
@@ -1637,6 +1669,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; KNL: ## %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: vzeroupper
@@ -1647,6 +1681,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $4, %k0, %k0
+; SKX-NEXT: kshiftrb $4, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
@@ -1654,6 +1690,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -1664,6 +1702,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1674,6 +1714,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; X86-NEXT: vpmovd2m %xmm0, %k0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $4, %k0, %k0
+; X86-NEXT: kshiftrb $4, %k0, %k0
; X86-NEXT: kmovb %k0, (%eax)
; X86-NEXT: retl
%x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
@@ -5206,6 +5248,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5220,6 +5264,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: retq
@@ -5229,6 +5275,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5243,6 +5291,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al
; AVX512DQ-NEXT: retq
@@ -5260,6 +5310,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorw %k1, %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp)
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: popl %ecx
@@ -5277,6 +5329,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5291,6 +5345,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: retq
@@ -5300,6 +5356,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5314,6 +5372,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al
; AVX512DQ-NEXT: retq
@@ -5331,6 +5391,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kxorw %k1, %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp)
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: popl %ecx
@@ -5348,6 +5410,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5362,6 +5426,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: retq
@@ -5371,6 +5437,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5385,6 +5453,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
; AVX512DQ-NEXT: kandw %k1, %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al
; AVX512DQ-NEXT: retq
@@ -5402,6 +5472,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: kandw %k1, %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp)
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: popl %ecx
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index a60f6ee06e73..8eadc360d939 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -552,6 +552,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
; X86-AVX512F-NEXT: kandnw %k1, %k2, %k1
; X86-AVX512F-NEXT: kandw %k2, %k0, %k0
; X86-AVX512F-NEXT: korw %k1, %k0, %k0
+; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0
+; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0
; X86-AVX512F-NEXT: kmovw %k0, %eax
; X86-AVX512F-NEXT: movb %al, (%edx)
; X86-AVX512F-NEXT: popl %esi
@@ -568,6 +570,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
; X64-AVX512F-NEXT: kandnw %k1, %k2, %k1
; X64-AVX512F-NEXT: kandw %k2, %k0, %k0
; X64-AVX512F-NEXT: korw %k1, %k0, %k0
+; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0
+; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0
; X64-AVX512F-NEXT: kmovw %k0, %eax
; X64-AVX512F-NEXT: movb %al, (%rsi)
; X64-AVX512F-NEXT: retq
@@ -587,6 +591,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
; X86-AVX512BW-NEXT: kandnw %k1, %k2, %k1
; X86-AVX512BW-NEXT: kandw %k2, %k0, %k0
; X86-AVX512BW-NEXT: korw %k1, %k0, %k0
+; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; X86-AVX512BW-NEXT: kmovd %k0, %eax
; X86-AVX512BW-NEXT: movb %al, (%edx)
; X86-AVX512BW-NEXT: popl %esi
@@ -603,6 +609,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
; X64-AVX512BW-NEXT: kandnw %k1, %k2, %k1
; X64-AVX512BW-NEXT: kandw %k2, %k0, %k0
; X64-AVX512BW-NEXT: korw %k1, %k0, %k0
+; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; X64-AVX512BW-NEXT: kmovd %k0, %eax
; X64-AVX512BW-NEXT: movb %al, (%rsi)
; X64-AVX512BW-NEXT: retq
@@ -634,6 +642,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
; X86-AVX512F-NEXT: movzbl (%eax), %ecx
; X86-AVX512F-NEXT: kmovw %ecx, %k0
; X86-AVX512F-NEXT: .LBB18_3:
+; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0
+; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0
; X86-AVX512F-NEXT: kmovw %k0, %ecx
; X86-AVX512F-NEXT: movb %cl, (%eax)
; X86-AVX512F-NEXT: retl
@@ -653,6 +663,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
; X64-AVX512F-NEXT: movzbl (%rsi), %eax
; X64-AVX512F-NEXT: kmovw %eax, %k0
; X64-AVX512F-NEXT: .LBB18_3:
+; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0
+; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0
; X64-AVX512F-NEXT: kmovw %k0, %eax
; X64-AVX512F-NEXT: movb %al, (%rsi)
; X64-AVX512F-NEXT: retq
@@ -675,6 +687,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
; X86-AVX512BW-NEXT: movzbl (%eax), %ecx
; X86-AVX512BW-NEXT: kmovd %ecx, %k0
; X86-AVX512BW-NEXT: .LBB18_3:
+; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; X86-AVX512BW-NEXT: kmovd %k0, %ecx
; X86-AVX512BW-NEXT: movb %cl, (%eax)
; X86-AVX512BW-NEXT: retl
@@ -694,6 +708,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi
; X64-AVX512BW-NEXT: movzbl (%rsi), %eax
; X64-AVX512BW-NEXT: kmovd %eax, %k0
; X64-AVX512BW-NEXT: .LBB18_3:
+; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; X64-AVX512BW-NEXT: kmovd %k0, %eax
; X64-AVX512BW-NEXT: movb %al, (%rsi)
; X64-AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
index 839dd1c70351..f921d10f071d 100644
--- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll
@@ -89,9 +89,13 @@ define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>*
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
-; CHECK-NEXT: kmovw %k1, %eax
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: movb %cl, (%rsi)
+; CHECK-NEXT: kshiftlw $12, %k0, %k2
+; CHECK-NEXT: kshiftrw $12, %k2, %k2
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: movb %al, (%rsi)
+; CHECK-NEXT: kshiftlw $12, %k1, %k0
+; CHECK-NEXT: kshiftrw $12, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movb %al, (%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -115,9 +119,13 @@ define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>*
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
-; CHECK-NEXT: kmovw %k1, %eax
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: movb %cl, (%rsi)
+; CHECK-NEXT: kshiftlw $12, %k0, %k2
+; CHECK-NEXT: kshiftrw $12, %k2, %k2
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: movb %al, (%rsi)
+; CHECK-NEXT: kshiftlw $12, %k1, %k0
+; CHECK-NEXT: kshiftrw $12, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movb %al, (%rdx)
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
@@ -140,9 +148,13 @@ define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>*
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
-; CHECK-NEXT: kmovw %k1, %eax
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: movb %cl, (%rsi)
+; CHECK-NEXT: kshiftlw $14, %k0, %k2
+; CHECK-NEXT: kshiftrw $14, %k2, %k2
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: movb %al, (%rsi)
+; CHECK-NEXT: kshiftlw $14, %k1, %k0
+; CHECK-NEXT: kshiftrw $14, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movb %al, (%rdx)
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 8197a3521f09..8a2ed6c8c8a8 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1131,6 +1131,8 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kxorw %k2, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 1b5aef61ebf3..746f9e5e642e 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3559,64 +3559,65 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
;
; AVX512-LABEL: smulo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0
; AVX512-NEXT: kshiftrw $3, %k0, %k1
-; AVX512-NEXT: kmovd %k1, %r9d
-; AVX512-NEXT: andb $1, %r9b
-; AVX512-NEXT: negb %r9b
+; AVX512-NEXT: kmovd %k1, %r10d
+; AVX512-NEXT: andb $1, %r10b
+; AVX512-NEXT: negb %r10b
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kshiftrw $3, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %r10d
-; AVX512-NEXT: andb $1, %r10b
-; AVX512-NEXT: negb %r10b
+; AVX512-NEXT: kmovd %k2, %r9d
+; AVX512-NEXT: andb $1, %r9b
+; AVX512-NEXT: negb %r9b
; AVX512-NEXT: kshiftrw $2, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %r11d
-; AVX512-NEXT: andb $1, %r11b
-; AVX512-NEXT: negb %r11b
+; AVX512-NEXT: kmovd %k2, %ebp
+; AVX512-NEXT: andb $1, %bpl
+; AVX512-NEXT: negb %bpl
; AVX512-NEXT: kshiftrw $2, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %ebx
-; AVX512-NEXT: andb $1, %bl
-; AVX512-NEXT: negb %bl
+; AVX512-NEXT: kmovd %k2, %edx
+; AVX512-NEXT: andb $1, %dl
+; AVX512-NEXT: negb %dl
; AVX512-NEXT: kshiftrw $1, %k0, %k2
; AVX512-NEXT: kmovd %k2, %esi
; AVX512-NEXT: andb $1, %sil
; AVX512-NEXT: negb %sil
; AVX512-NEXT: kshiftrw $1, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %edx
-; AVX512-NEXT: andb $1, %dl
-; AVX512-NEXT: negb %dl
+; AVX512-NEXT: kmovd %k2, %ecx
+; AVX512-NEXT: andb $1, %cl
+; AVX512-NEXT: negb %cl
; AVX512-NEXT: kmovd %k1, %eax
; AVX512-NEXT: andb $1, %al
; AVX512-NEXT: negb %al
-; AVX512-NEXT: kmovd %k0, %ecx
-; AVX512-NEXT: andb $1, %cl
-; AVX512-NEXT: negb %cl
+; AVX512-NEXT: kmovd %k0, %ebx
+; AVX512-NEXT: andb $1, %bl
+; AVX512-NEXT: negb %bl
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: imulb %cl
-; AVX512-NEXT: movl %eax, %r8d
+; AVX512-NEXT: imulb %bl
+; AVX512-NEXT: movl %eax, %r11d
; AVX512-NEXT: seto %al
-; AVX512-NEXT: movl %r8d, %ecx
-; AVX512-NEXT: andb $1, %cl
-; AVX512-NEXT: negb %cl
-; AVX512-NEXT: cmpb %r8b, %cl
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: orb %al, %cl
+; AVX512-NEXT: movl %r11d, %ebx
+; AVX512-NEXT: andb $1, %bl
+; AVX512-NEXT: negb %bl
+; AVX512-NEXT: cmpb %r11b, %bl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %al, %bl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: movw $-3, %ax
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kandw %k0, %k1, %k1
-; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: imulb %sil
-; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: movl %eax, %r8d
; AVX512-NEXT: seto %al
-; AVX512-NEXT: movl %edx, %ecx
+; AVX512-NEXT: movl %r8d, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: negb %cl
-; AVX512-NEXT: cmpb %dl, %cl
+; AVX512-NEXT: cmpb %r8b, %cl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
@@ -3627,8 +3628,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: movw $-5, %ax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: kandw %k1, %k2, %k2
-; AVX512-NEXT: movl %r11d, %eax
-; AVX512-NEXT: imulb %bl
+; AVX512-NEXT: movl %ebp, %eax
+; AVX512-NEXT: imulb %dl
; AVX512-NEXT: movl %eax, %esi
; AVX512-NEXT: seto %al
; AVX512-NEXT: movl %esi, %ecx
@@ -3643,25 +3644,26 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: korw %k3, %k2, %k2
; AVX512-NEXT: kshiftlw $13, %k2, %k2
; AVX512-NEXT: kshiftrw $13, %k2, %k2
-; AVX512-NEXT: movl %r10d, %eax
-; AVX512-NEXT: imulb %r9b
+; AVX512-NEXT: movl %r9d, %eax
+; AVX512-NEXT: imulb %r10b
; AVX512-NEXT: # kill: def $al killed $al def $eax
; AVX512-NEXT: seto %cl
-; AVX512-NEXT: movl %eax, %ebx
-; AVX512-NEXT: andb $1, %bl
-; AVX512-NEXT: negb %bl
-; AVX512-NEXT: cmpb %al, %bl
-; AVX512-NEXT: setne %bl
-; AVX512-NEXT: orb %cl, %bl
+; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: andb $1, %dl
+; AVX512-NEXT: negb %dl
+; AVX512-NEXT: cmpb %al, %dl
+; AVX512-NEXT: setne %dl
+; AVX512-NEXT: orb %cl, %dl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: kmovd %ecx, %k3
; AVX512-NEXT: kshiftlw $3, %k3, %k3
; AVX512-NEXT: korw %k3, %k2, %k2
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kmovd %r8d, %k2
+; AVX512-NEXT: andl $1, %r11d
+; AVX512-NEXT: kmovw %r11d, %k2
; AVX512-NEXT: kandw %k0, %k2, %k0
-; AVX512-NEXT: kmovd %edx, %k2
+; AVX512-NEXT: kmovd %r8d, %k2
; AVX512-NEXT: kshiftlw $15, %k2, %k2
; AVX512-NEXT: kshiftrw $14, %k2, %k2
; AVX512-NEXT: korw %k2, %k0, %k0
@@ -3680,6 +3682,7 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d4d458560fa6..f9dbaccfbd72 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1140,6 +1140,8 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kxorw %k2, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index c34653be4a02..5954f1195741 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1180,7 +1180,9 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kandnw %k0, %k1, %k2
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index cc25fd5bec78..4766fe90a3d4 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -3188,52 +3188,53 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
;
; AVX512-LABEL: umulo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: kshiftrw $3, %k0, %k1
-; AVX512-NEXT: kmovd %k1, %r9d
-; AVX512-NEXT: andb $1, %r9b
+; AVX512-NEXT: kmovd %k1, %r8d
+; AVX512-NEXT: andb $1, %r8b
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kshiftrw $3, %k1, %k2
+; AVX512-NEXT: kmovd %k2, %r9d
+; AVX512-NEXT: andb $1, %r9b
+; AVX512-NEXT: kshiftrw $2, %k0, %k2
; AVX512-NEXT: kmovd %k2, %r10d
; AVX512-NEXT: andb $1, %r10b
-; AVX512-NEXT: kshiftrw $2, %k0, %k2
+; AVX512-NEXT: kshiftrw $2, %k1, %k2
; AVX512-NEXT: kmovd %k2, %r11d
; AVX512-NEXT: andb $1, %r11b
-; AVX512-NEXT: kshiftrw $2, %k1, %k2
-; AVX512-NEXT: kmovd %k2, %ebx
-; AVX512-NEXT: andb $1, %bl
; AVX512-NEXT: kshiftrw $1, %k0, %k2
-; AVX512-NEXT: kmovd %k2, %edx
-; AVX512-NEXT: andb $1, %dl
+; AVX512-NEXT: kmovd %k2, %ecx
+; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: kshiftrw $1, %k1, %k2
; AVX512-NEXT: kmovd %k2, %esi
; AVX512-NEXT: andb $1, %sil
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: andb $1, %al
-; AVX512-NEXT: kmovd %k1, %ecx
-; AVX512-NEXT: andb $1, %cl
+; AVX512-NEXT: kmovd %k1, %edx
+; AVX512-NEXT: andb $1, %dl
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: mulb %cl
-; AVX512-NEXT: movl %eax, %r8d
+; AVX512-NEXT: mulb %dl
+; AVX512-NEXT: movl %eax, %edx
; AVX512-NEXT: seto %al
-; AVX512-NEXT: testb $-2, %r8b
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: orb %al, %cl
+; AVX512-NEXT: testb $-2, %dl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %al, %bl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: movw $-3, %ax
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kandw %k0, %k1, %k1
-; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: mulb %sil
-; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: seto %al
-; AVX512-NEXT: testb $-2, %dl
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: orb %al, %cl
+; AVX512-NEXT: testb $-2, %bpl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %al, %bl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k2
; AVX512-NEXT: kshiftlw $15, %k2, %k2
@@ -3242,35 +3243,36 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: movw $-5, %ax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: kandw %k1, %k2, %k2
-; AVX512-NEXT: movl %r11d, %eax
-; AVX512-NEXT: mulb %bl
+; AVX512-NEXT: movl %r10d, %eax
+; AVX512-NEXT: mulb %r11b
; AVX512-NEXT: movl %eax, %esi
; AVX512-NEXT: seto %al
; AVX512-NEXT: testb $-2, %sil
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: orb %al, %cl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %al, %bl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k3
; AVX512-NEXT: kshiftlw $2, %k3, %k3
; AVX512-NEXT: korw %k3, %k2, %k2
; AVX512-NEXT: kshiftlw $13, %k2, %k2
; AVX512-NEXT: kshiftrw $13, %k2, %k2
-; AVX512-NEXT: movl %r9d, %eax
-; AVX512-NEXT: mulb %r10b
+; AVX512-NEXT: movl %r8d, %eax
+; AVX512-NEXT: mulb %r9b
; AVX512-NEXT: # kill: def $al killed $al def $eax
-; AVX512-NEXT: seto %cl
+; AVX512-NEXT: seto %bl
; AVX512-NEXT: testb $-2, %al
-; AVX512-NEXT: setne %bl
-; AVX512-NEXT: orb %cl, %bl
+; AVX512-NEXT: setne %cl
+; AVX512-NEXT: orb %bl, %cl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: kmovd %ecx, %k3
; AVX512-NEXT: kshiftlw $3, %k3, %k3
; AVX512-NEXT: korw %k3, %k2, %k2
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kmovd %r8d, %k2
+; AVX512-NEXT: andl $1, %edx
+; AVX512-NEXT: kmovw %edx, %k2
; AVX512-NEXT: kandw %k0, %k2, %k0
-; AVX512-NEXT: kmovd %edx, %k2
+; AVX512-NEXT: kmovd %ebp, %k2
; AVX512-NEXT: kshiftlw $15, %k2, %k2
; AVX512-NEXT: kshiftrw $14, %k2, %k2
; AVX512-NEXT: korw %k2, %k0, %k0
@@ -3289,6 +3291,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 76c3e5ad3290..afb0f6cce29c 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1227,7 +1227,9 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)