{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: GRAVITY DATASETS EXPLORING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: INITIALIZATION\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math\n",
    "import gc\n",
    "import datetime\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pandas version:  0.25.3\n",
      "python version:  3.7.4\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: VERSION CONTROL\n",
    "\n",
    "from platform import python_version\n",
    "print('pandas version: ', pd.__version__)\n",
    "print('python version: ', python_version())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: MAIN CONSTANTS\n",
    "\n",
    "### MultiIndex level slice constant:\n",
    "All = slice(None)\n",
    "### Regions list:\n",
    "list_regions = ['DM', 'EM', 'FM']\n",
    "### Universe path:\n",
    "str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'\n",
    "### Activities naming:\n",
    "dict_activity = {}\n",
    "dict_activity['imf_dots'] = 'Trade Export'\n",
    "dict_activity['imf_cpis'] = 'Portfolio Investment'\n",
    "dict_activity['oecd_fdi'] = 'Direct Investment'\n",
    "dict_activity['bis_lbs'] = 'Bank Lending'\n",
    "dict_activity['gravity'] = 'Gravity'\n",
    "### CEPII dataset:\n",
    "str_path_cepii_dataset = 'Data_Files/Source_Files/cepii_dataset.h5'\n",
    "str_distance_dataset = 'distance_dataset'\n",
    "### WB WDI GDP dataset:\n",
    "str_path_wb_gdp_dataset = 'Data_Files/Source_Files/gdp_dataset.h5'\n",
    "str_wb_gdp_dataset = 'gdp_dataset'\n",
    "### BIS Loans dataset:\n",
    "str_path_bis_lbs_combined = 'Data_Files/Source_Files/bis_combined.h5'\n",
    "str_full_bis_lbs_combined = 'bis_full_combined'\n",
    "### IMF CPIS dataset:\n",
    "str_path_imf_cpis_combined = 'Data_Files/Source_Files/cpis_combined.h5'\n",
    "str_full_imf_cpis_combined = 'cpis_full_combined'\n",
    "### Filtered IMF CPIS dataset:\n",
    "str_path_imf_cpis_filtered = 'Data_Files/Source_Files/cpis_filtered.h5'\n",
    "str_key_imf_cpis_filtered = 'cpis_filtered'\n",
    "### IMF DOTS datasets:\n",
    "str_path_imf_dots_combined = 'Data_Files/Source_Files/dots_combined.h5'\n",
    "str_full_imf_dots_combined = 'dots_full_combined'\n",
    "str_path_imf_dots_world = 'Data_Files/Source_Files/dots_world_export.h5'\n",
    "str_full_imf_dots_world = 'dots_world_export'\n",
    "### OECD FDI dataset:\n",
    "str_path_oecd_fdi_combined = 'Data_Files/Source_Files/oecd_combined.h5'\n",
    "str_full_oecd_fdi_combined = 'oecd_full_combined'\n",
    "str_path_direct_out_net = 'Data_Files/Source_Files/direct_outward_net.h5'\n",
    "str_key_direct_out_net = 'direct_outward'\n",
    "### Technical Constants:\n",
    "date_start = pd.Timestamp('1989-12-29')\n",
    "str_date_end = '2021-12-31'\n",
    "date_end = pd.Timestamp(str_date_end)\n",
    "date_ison = pd.Timestamp('1994-12-31')\n",
    "### Distance power for gravity calculation:\n",
    "flo_dist_power = 1 / 2\n",
    "### Bloomberg structured data extraction parameters:\n",
    "str_path_bb_hdf = 'Data_Files/Source_Files/Bloomberg_prepared.h5'\n",
    "str_key_ret_daily = 'bb_ret_daily'\n",
    "str_ret_daily_csv_path = 'Data_Files/Source_Files/ret_daily.csv'\n",
    "### Saved annualized activities:\n",
    "str_path_act_annualized = 'Data_Files/Source_Files/datasets_annualized.h5'\n",
    "str_path_act_weights = 'Data_Files/Source_Files/datasets_weights.h5'\n",
    "str_path_world_export_annualized = 'Data_Files/Source_Files/world_export_annualized.h5'\n",
    "str_key_world_export_annualized = 'world_export_ann'\n",
    "### Herfindahl index threshold:\n",
    "flo_hi_threshold = 0.0 # 1.1\n",
    "### Saved herfindahl values:\n",
    "str_path_herfindahl = 'Data_Files/Source_Files/herfindahl_indices.h5'\n",
    "### Returns average parameters:\n",
    "int_ave_months = 6\n",
    "int_halflife_months = 2\n",
    "### Thresholds for weighted cross-sectional average returns calculation:\n",
    "int_select_top = 3\n",
    "flo_select_share = 0.05\n",
    "### Saved weighted returns:\n",
    "str_path_ret_weighted = 'Data_Files/Source_Files/returns_weighted.h5'\n",
    "str_key_weighted = 'ret_weighted'\n",
    "### Saved export data:\n",
    "str_path_gravity_results = 'Data_Files/Source_Files/gravity_export.h5'\n",
    "str_key_activity_sum = 'activity_sum'\n",
    "str_key_activity_share = 'activity_share'\n",
    "str_key_gdp_total = 'gdp_total'\n",
    "str_key_herfindahl = 'herfindahl_index'\n",
    "str_key_openess = 'openess_measure'\n",
    "str_key_ret_weighted = 'ret_weighted'\n",
    "### CSV to Export data:\n",
    "str_activity_sum_csv_path = 'Data_Files/Test_Files/activity_sum.csv'\n",
    "str_activity_share_csv_path = 'Data_Files/Test_Files/activity_share.csv'\n",
    "str_gdp_total_csv_path = 'Data_Files/Test_Files/gdp_total.csv'\n",
    "str_herfindahl_csv_path = 'Data_Files/Test_Files/herfindahl.csv'\n",
    "str_openess_csv_path = 'Data_Files/Test_Files/openess.csv'\n",
    "str_ret_weighted_csv_path = 'Data_Files/Test_Files/ret_weighted.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "### DEFINING EXPONENTIAL WEIGHT\n",
    "\n",
    "def exp_weight_single(halflife_len = 3, num_element = 0):\n",
    "    ### Weight calculating:\n",
    "    num_period_factor = math.exp(math.log(0.5) / round(halflife_len))\n",
    "    num_weight = np.exp(math.log(num_period_factor) * num_element)\n",
    "    ### Result output:\n",
    "    return num_weight"
   ]
  },
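  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: EXPONENTIAL WEIGHT HALF-LIFE\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): made-up arguments,\n",
    "### checking that the weight halves every halflife_len elements:\n",
    "flo_w0 = exp_weight_single(halflife_len = 3, num_element = 0)\n",
    "flo_w3 = exp_weight_single(halflife_len = 3, num_element = 3)\n",
    "flo_w6 = exp_weight_single(halflife_len = 3, num_element = 6)\n",
    "### Expected: 1.0 / 0.5 / 0.25:\n",
    "print('weights at elements 0 / 3 / 6: ', round(flo_w0, 4), '/', round(flo_w3, 4), '/', round(flo_w6, 4))"
   ]
  },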
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "### DEFINING WEIGHTED AVERAGE\n",
    "\n",
    "def weighted_average(ser_data, ser_weight = False, int_min_count = 0):\n",
    "    ### Default output:\n",
    "    num_result = np.NaN\n",
    "    ### Checking for data presence:\n",
    "    if (ser_data.count() > int_min_count):       \n",
    "        ### Checking for weights dataset:\n",
    "        if isinstance(ser_weight, bool):\n",
    "            ### Calculating of simple average:\n",
    "            num_result = np.nanmean(ser_data.values)\n",
    "        else:\n",
    "            ### Weights filtering:\n",
    "            list_weight = ser_weight[ser_data.dropna().index].values\n",
    "            ### Checking for weights presence:\n",
    "            if np.nansum(list_weight):\n",
    "                ### Data filtering:\n",
    "                list_data = ser_data.dropna().values\n",
    "                ### Weighted average calculating:\n",
    "                num_result = np.nansum(list_data * list_weight) / np.nansum(list_weight)\n",
    "    ### Results output:\n",
    "    return num_result"
   ]
  },
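  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: WEIGHTED AVERAGE\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): toy made-up series,\n",
    "### one NaN observation that should be skipped together with its weight:\n",
    "ser_demo_data = pd.Series([1.0, 2.0, np.NaN, 4.0], index = ['A', 'B', 'C', 'D'])\n",
    "ser_demo_weight = pd.Series([1.0, 1.0, 1.0, 2.0], index = ['A', 'B', 'C', 'D'])\n",
    "### Simple mean over non-NaN values: (1 + 2 + 4) / 3\n",
    "print('simple: ', weighted_average(ser_demo_data))\n",
    "### Weighted mean over non-NaN values: (1 * 1 + 2 * 1 + 4 * 2) / (1 + 1 + 2)\n",
    "print('weighted: ', weighted_average(ser_demo_data, ser_demo_weight))"
   ]
  },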
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "### DEFINING COUNTRY CODES EXTRACTOR\n",
    "\n",
    "def get_country_codes(use_local_copy = False):  \n",
    "    ### In case if URL is unavailable:\n",
    "    if (use_local_copy):\n",
    "        url_country_code = 'Data_Files/Source_Files/countrycode.html'\n",
    "    ### Online extraction:\n",
    "    else:\n",
    "        url_country_code = 'https://countrycode.org/'\n",
    "    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]\n",
    "    df_full_codes[['ISO SHORT', 'ISO LONG']] = df_full_codes['ISO CODES'].str.split(' / ', expand = True)\n",
    "    df_result = df_full_codes[['ISO SHORT', 'ISO LONG']].sort_index()    \n",
    "    df_result.index = df_result.index.str.upper()\n",
    "    ### Results output:\n",
    "    return df_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)\n",
    "\n",
    "def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):\n",
    "    ### Defining business-month-end reindexation on country level:\n",
    "    def country_modify(ser_raw_country, date_end):\n",
    "        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()\n",
    "        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')\n",
    "        return ser_res_country.reindex(range_country).ffill()\n",
    "    ### Markets encoding table:\n",
    "    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     \n",
    "    ### Loading source file:\n",
    "    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],\n",
    "                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', \n",
    "                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)\n",
    "    ### Converting source file:\n",
    "    df_raw_universe.index.names = ['Country', 'Date']\n",
    "    ser_raw_universe = df_raw_universe['Region']\n",
    "    ser_raw_universe.fillna(0, inplace = True)\n",
    "    ser_raw_universe.name = 'Market'\n",
    "    ### By country reindexation and translation:\n",
    "    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)\n",
    "    ser_res_universe.index.names = ['Country', 'Date']\n",
    "    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() \n",
    "    ### Expanding membership for primary regions members by backfilling:\n",
    "    if int_backfill_months:\n",
    "        ### List of regions:\n",
    "        list_region = list(ser_res_universe.dropna().unique())\n",
    "        ### Initialising of collection of series with backfilled data for each region:\n",
    "        list_ison_backfill = []\n",
    "        ### Regions looping:\n",
    "        for iter_region in list_region:\n",
    "            ### Defining start of region date:\n",
    "            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]\n",
    "            ### Creating dates index to backfilling:\n",
    "            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]\n",
    "            ### Creating primary countries index to backfilling:            \n",
    "            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')\n",
    "            ### Creating full index:\n",
    "            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])\n",
    "            ### Series with backfilled data:\n",
    "            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))\n",
    "        ### Combination of backfilled series and original ISON data:    \n",
    "        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  \n",
    "        ser_res_universe.index.names = ['Date', 'Country']\n",
    "    ### Converting to daily frequency:\n",
    "    if bool_daily:\n",
    "        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    \n",
    "    ### Results output:\n",
    "    ser_res_universe.name = 'Market'\n",
    "    return ser_res_universe"
   ]
  },
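  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: BUSINESS-MONTH-END REINDEXATION PATTERN\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): made-up membership\n",
    "### series showing the resample('MS').last().resample('BM').last() idiom used\n",
    "### inside country_modify: each month's last observation lands on the\n",
    "### business-month-end date:\n",
    "ser_demo = pd.Series(['DM', 'DM', 'EM'], index = pd.to_datetime(['2020-01-15', '2020-02-10', '2020-03-05']))\n",
    "print(ser_demo.resample('MS').last().resample('BM').last())"
   ]
  },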
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "133"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### RUN EVERY TIME: DATASETS LOADING\n",
    "\n",
    "dict_dataset = {}\n",
    "dict_dataset['imf_dots'] = pd.read_hdf(path_or_buf = str_path_imf_dots_combined, key = str_full_imf_dots_combined).droplevel('Market').sort_index()['Export_Augmented']\n",
    "dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_combined, key = str_full_imf_cpis_combined).droplevel('Market').sort_index()['Asset_Augmented']\n",
    "#dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered)\n",
    "#dict_dataset['imf_cpis'].loc[dict_dataset['imf_cpis'] < 0.0] = np.NaN\n",
    "#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_oecd_fdi_combined, key = str_full_oecd_fdi_combined).droplevel('Market').sort_index()['Asset']\n",
    "#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN\n",
    "#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_direct_out_net, key = str_key_direct_out_net)\n",
    "#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN\n",
    "#dict_dataset['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_bis_lbs_combined, key = str_full_bis_lbs_combined)\\\n",
    "#    .set_index(['Date', 'Reporter', 'Partner']).sort_index()['Claim_Augmented']\n",
    "\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Distance</th>\n",
       "      <th>GDP_Reporter</th>\n",
       "      <th>GDP_Partner</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Date</th>\n",
       "      <th>Reporter</th>\n",
       "      <th>Partner</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">2020-12-31</th>\n",
       "      <th>IL</th>\n",
       "      <th>US</th>\n",
       "      <td>10516.0</td>\n",
       "      <td>4.132677e+11</td>\n",
       "      <td>2.106047e+13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>US</th>\n",
       "      <th>IL</th>\n",
       "      <td>10516.0</td>\n",
       "      <td>2.106047e+13</td>\n",
       "      <td>4.132677e+11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             Distance  GDP_Reporter   GDP_Partner\n",
       "Date       Reporter Partner                                      \n",
       "2020-12-31 IL       US        10516.0  4.132677e+11  2.106047e+13\n",
       "           US       IL        10516.0  2.106047e+13  4.132677e+11"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "### RUN EVERY TIME: GRAVITY DATASET CONSTRUCTION\n",
    "\n",
    "### GDP loading:\n",
    "ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)\n",
    "### Distances loading:\n",
    "ser_dist = pd.read_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset)['distw']\n",
    "### Distances naming:\n",
    "ser_dist.index.names = ['Reporter', 'Partner']\n",
    "ser_dist.name = 'Distance'\n",
    "### Dropping internal distances:\n",
    "df_dist = ser_dist.reset_index()\n",
    "df_dist.drop(df_dist[df_dist['Reporter'] == df_dist['Partner']].index, inplace = True)\n",
    "ser_dist = df_dist.set_index(['Reporter', 'Partner']).squeeze().sort_index()\n",
    "### GDP duplicating:\n",
    "ser_gdp_reporter = ser_gdp[:]\n",
    "ser_gdp_reporter.index.names = ['Date', 'Reporter']\n",
    "ser_gdp_reporter.name = 'GDP_Reporter'\n",
    "ser_gdp_partner = ser_gdp[:]\n",
    "ser_gdp_partner.index.names = ['Date', 'Partner']\n",
    "ser_gdp_partner.name = 'GDP_Partner'\n",
    "### Reporters data connecting:\n",
    "df_reporter = ser_dist.to_frame().join(ser_gdp_reporter).sort_index()\n",
    "### Partners data connecting:\n",
    "df_partner = ser_dist.to_frame().join(ser_gdp_partner).drop('Distance', axis = 1).sort_index()\n",
    "df_partner = df_partner.reorder_levels([1, 0, 2])\n",
    "### Joining data and Gravity calculation:\n",
    "df_gravity = pd.concat([df_reporter, df_partner], axis = 1)\n",
    "df_gravity = df_gravity.reset_index('Date').dropna(subset = ['Date']).set_index('Date', append = True).reorder_levels([2, 0, 1]).sort_index()\n",
    "display(df_gravity.loc[('2020-12-31', ['US', 'IL'], ['US', 'IL']), :])\n",
    "ser_gravity = (df_gravity['GDP_Reporter'] / 10 ** 9) * (df_gravity['GDP_Partner'] / 10 ** 9) / (df_gravity['Distance'] ** flo_dist_power)\n",
    "### Adding gravity to activities:\n",
    "dict_dataset['gravity'] = ser_gravity.sort_index()"
   ]
  },
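  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: GRAVITY FORMULA ON ONE PAIR\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): recomputing gravity by\n",
    "### hand for the displayed IL -> US pair, assuming it is present in df_gravity:\n",
    "### Gravity = (GDP_Reporter / 10^9) * (GDP_Partner / 10^9) / Distance^flo_dist_power\n",
    "ser_demo_pair = df_gravity.loc[(pd.Timestamp('2020-12-31'), 'IL', 'US')]\n",
    "flo_demo_gravity = (ser_demo_pair['GDP_Reporter'] / 10 ** 9) * (ser_demo_pair['GDP_Partner'] / 10 ** 9) / (ser_demo_pair['Distance'] ** flo_dist_power)\n",
    "print('manual: ', flo_demo_gravity)\n",
    "print('series: ', ser_gravity.loc[(pd.Timestamp('2020-12-31'), 'IL', 'US')])"
   ]
  },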
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "            Reporter  Partner\n",
       "1989-04-28  CI        FI              NaN\n",
       "1989-05-31  CI        FI              NaN\n",
       "1989-06-30  CI        FI              NaN\n",
       "1989-07-31  CI        FI              NaN\n",
       "1989-08-31  CI        FI              NaN\n",
       "                                   ...   \n",
       "2022-03-31  ZM        PH            0.307\n",
       "                      SE            4.323\n",
       "                      TW            0.559\n",
       "                      US         3721.000\n",
       "                      ZA          960.000\n",
       "Name: Claim_Augmented, Length: 886292, dtype: float32"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### TEMP\n",
    "\n",
    "gc.collect()\n",
    "dict_annualized = {}\n",
    "def annualize(ser_group):\n",
    "    ser_ann = ser_group.droplevel(['Reporter', 'Partner'])\n",
    "    ### Filter not empty pairs:\n",
    "    if (ser_ann.count() > 0):\n",
    "        ### Data frequency calculation (observations per year):\n",
    "        ser_freq = ser_ann.dropna().resample('BY').count()\n",
    "        int_freq = int(ser_freq[ser_freq > 0].median())\n",
    "        ### Data periodicity definition (months number covered per one observation):\n",
    "        int_period = 12 // int_freq\n",
    "        ### Convert data to monthly frequency:\n",
    "        if (int_period > 1):\n",
    "            ### Prepending one more period to backfill first valid observation:\n",
    "            ser_ann = ser_ann.append(pd.Series(np.NaN, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()    \n",
    "            ### Replace period value to monthly value:\n",
    "            ser_ann = (ser_ann / int_period)\n",
    "            ### Backfill monthly value for a whole period:\n",
    "            ser_ann = ser_ann.resample('BM').bfill(limit = int_period)\n",
    "            ### Drop dummy observation:\n",
    "            ser_ann.drop(ser_ann.index[0], inplace = True)\n",
    "        ### Annualize data:\n",
    "        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()\n",
    "    ### Results output:\n",
    "    return ser_ann\n",
    "\n",
    "dict_dataset['bis_lbs'].groupby(['Reporter', 'Partner']).apply(annualize).astype('float32').reorder_levels([2, 0, 1]).sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data_Files/Source_Files/datasets_annualized.h5 File removed\n",
      "imf_dots : annualizing started\n",
      "imf_dots : annualizing done\n",
      "imf_cpis : annualizing started\n",
      "imf_cpis : annualizing done\n",
      "oecd_fdi : annualizing started\n",
      "oecd_fdi : annualizing done\n",
      "gravity : annualizing started\n",
      "gravity : annualizing done\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: ACTIVITIES ANNUALIZATION\n",
    "\n",
    "gc.collect()\n",
    "dict_annualized = {}\n",
    "def annualize(ser_group):\n",
    "    ser_ann = ser_group.droplevel(['Reporter', 'Partner'])\n",
    "    ### Filter not empty pairs:\n",
    "    if (ser_ann.count() > 0):\n",
    "        ### Data frequency calculation (observations per year):\n",
    "        ser_freq = ser_ann.dropna().resample('BY').count()\n",
    "        int_freq = int(ser_freq[ser_freq > 0].median())\n",
    "        ### Data periodicity definition (months number covered per one observation):\n",
    "        int_period = 12 // int_freq\n",
    "        ### Convert data to monthly frequency:\n",
    "        if (int_period > 1):\n",
    "            ### Prepending one more period to backfill first valid observation:\n",
    "            ser_ann = ser_ann.append(pd.Series(np.NaN, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()    \n",
    "            ### Replace period value to monthly value:\n",
    "            ser_ann = (ser_ann / int_period)\n",
    "            ### Backfill monthly value for a whole period:\n",
    "            ser_ann = ser_ann.resample('BM').bfill(limit = int_period - 1)\n",
    "            ### Drop dummy observation:\n",
    "            ser_ann.drop(ser_ann.index[0], inplace = True)\n",
    "        ### Annualize data:\n",
    "        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()\n",
    "    ### Results output:\n",
    "    return ser_ann\n",
    "### Deleting existing file with annualized data:\n",
    "if os.path.exists(str_path_act_annualized):\n",
    "    os.remove(str_path_act_annualized)\n",
    "    print(str_path_act_annualized, 'File removed')\n",
    "### Looping over activities:\n",
    "for iter_dataset in dict_dataset:\n",
    "#for iter_dataset in ['imf_cpis', 'oecd_fdi']:\n",
    "    gc.collect()\n",
    "    print(iter_dataset, ': annualizing started')    \n",
    "    ser_iter = dict_dataset[iter_dataset]\n",
    "    ser_iter_ann = ser_iter.groupby(['Reporter', 'Partner']).apply(annualize).astype('float32').reorder_levels([2, 0, 1]).sort_index()\n",
    "    ser_iter_ann.index.names = ['Date', 'Reporter', 'Partner']\n",
    "    ser_iter_ann.name = iter_dataset + '_ann'\n",
    "    ser_iter_ann.to_hdf(str_path_act_annualized, iter_dataset, mode = 'a', format = 'table')\n",
    "    print(iter_dataset, ': annualizing done')    \n",
    "#    break"
   ]
  },
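  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: ANNUALIZATION ON TOY QUARTERLY DATA\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): made-up quarterly flows\n",
    "### of 3.0 for a single XX -> YY pair; annualize() should spread each quarter to\n",
    "### 1.0 per month, so every full rolling year sums to 12.0:\n",
    "idx_demo = pd.MultiIndex.from_product([['XX'], ['YY'], pd.date_range('2018-03-30', periods = 8, freq = 'BQ')], names = ['Reporter', 'Partner', 'Date'])\n",
    "ser_demo = pd.Series(3.0, index = idx_demo)\n",
    "print(annualize(ser_demo).dropna().head())"
   ]
  },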
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION: DATA PREPARATION\n",
    "\n",
    "gc.collect()\n",
    "### Lags initialization:\n",
    "dict_lag = {}\n",
    "dict_lag['imf_dots'] = 3\n",
    "dict_lag['imf_cpis'] = 6\n",
    "dict_lag['oecd_fdi'] = 24\n",
    "#dict_lag['bis_lbs'] = 6\n",
    "dict_lag['gravity'] = 9\n",
    "### Periods to fill initialization:\n",
    "dict_ffill = {}\n",
    "dict_ffill['imf_dots'] = 1\n",
    "dict_ffill['imf_cpis'] = 12\n",
    "dict_ffill['oecd_fdi'] = 12\n",
    "#dict_ffill['bis_lbs'] = 3\n",
    "dict_ffill['gravity'] = 12\n",
    "### Annualized data loading:\n",
    "dict_annual = {}\n",
    "dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')\n",
    "dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')\n",
    "dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')\n",
    "#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')\n",
    "dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')\n",
    "### Returns loading & shifting:\n",
    "### HDF version:\n",
    "#ser_ret_usd = pd.read_hdf(str_path_bb_hdf, str_key_ret_daily).loc['USD']\n",
    "### CSV version:\n",
    "df_ret_usd = pd.read_csv(str_ret_daily_csv_path, header = None, sep = ';', parse_dates = [0])\n",
    "df_ret_usd.columns = ['Date', 'Country', 'Value']\n",
    "ser_ret_usd = df_ret_usd.set_index(['Date', 'Country']).squeeze()\n",
    "###\n",
    "ser_ret_shifted = ser_ret_usd.groupby('Country').shift()\n",
    "ser_ret_shifted.index.names = ['Date', 'Partner']\n",
    "ser_ret_shifted.name = 'ret_usd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION: AVERAGE RETURNS PREPARATION\n",
    "\n",
    "gc.collect()\n",
    "### Weighted mean for end-of-bmonth dates defining:\n",
    "def get_weighted_mean(ser_group):\n",
    "    ser_values = ser_group.droplevel('Partner')\n",
    "    if (ser_values.index[-1] == ser_values.index[-1] + pd.offsets.BMonthEnd(0)):\n",
    "        ser_weights = pd.Series(list_weight[-len(ser_values.index) : ], ser_values.index)\n",
    "        flo_result = weighted_average(ser_values, ser_weights)\n",
    "    else:\n",
    "        flo_result = np.NaN\n",
    "    return flo_result\n",
    "### Weights defining:\n",
    "list_weight = list(map(lambda iter_num: exp_weight_single(int_halflife_months * 22, iter_num), range(int_ave_months * 22)))[::-1]\n",
    "### Mean returns calculation:\n",
    "#ser_test = ser_ret_shifted.loc[:, ['US', 'IL']]#[-132 : ]\n",
    "ser_ret_ave = ser_ret_shifted.groupby('Partner', group_keys = False)\\\n",
    "                             .rolling(window = int_ave_months * 22, min_periods = int_ave_months * 22 // 2)\\\n",
    "                             .apply(get_weighted_mean, raw = False).dropna()"
   ]
  },
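  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: EXPONENTIAL WEIGHTS SHAPE\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): the reversed list puts\n",
    "### weight 1.0 on the most recent day, halving every int_halflife_months * 22\n",
    "### business days back:\n",
    "print('window length: ', len(list_weight))\n",
    "print('latest / one half-life back: ', round(list_weight[-1], 4), '/', round(list_weight[-1 - int_halflife_months * 22], 4))"
   ]
  },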
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: GDP & OPENESS MEASURE ANNUALIZATION\n",
    "\n",
    "### Annualization of differewnt frequency data definition:\n",
    "def annualize(ser_group):\n",
    "    ser_ann = ser_group.droplevel(['Country'])\n",
    "    ### Filter not empty pairs:\n",
    "    if (ser_ann.count() > 0):\n",
    "        ### Data frequency calculation (observations per year):\n",
    "        ser_freq = ser_ann.dropna().resample('BY').count()\n",
    "        int_freq = int(ser_freq[ser_freq > 0].median())\n",
    "        ### Data periodicity definition (months number covered per one observation):\n",
    "        int_period = 12 // int_freq\n",
    "        ### Convert data to monthly frequency:\n",
    "        if (int_period > 1):\n",
    "            ### Prepending one more period to backfill first valid observation:\n",
    "            ser_ann = ser_ann.append(pd.Series(np.NaN, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()    \n",
    "            ### Replace period value to monthly value:\n",
    "            ser_ann = (ser_ann / int_period)\n",
    "            ### Backfill monthly value for a whole period:\n",
    "            ser_ann = ser_ann.resample('BM').bfill(limit = int_period - 1)\n",
    "            ### Drop dummy observation:\n",
    "            ser_ann.drop(ser_ann.index[0], inplace = True)\n",
    "        ### Annualize data:\n",
    "        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()\n",
    "    ### Results output:\n",
    "    return ser_ann\n",
    "### GDP Loading:\n",
    "ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)\n",
    "### GDP annualization:\n",
    "ser_gdp_ann = (ser_gdp.groupby(['Country']).apply(annualize).astype('float32') / 1000000).reorder_levels([1, 0]).sort_index()\n",
    "ser_gdp_ann.index.names = ['Date', 'Country']\n",
    "ser_gdp_ann.to_hdf(str_path_gravity_results, str_key_gdp_total, mode = 'a')\n",
    "### Export data loading & converting:\n",
    "ser_export_ann = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')\n",
    "### Added to exclude trade bilaterals for partners that don't have returns data:\n",
    "idx_ret_exist = ser_ret_ave.dropna().index\n",
    "ser_export_ann = ser_export_ann.groupby('Reporter').apply(lambda ser_country: ser_country.droplevel('Reporter').reindex(idx_ret_exist))\\\n",
    "                               .reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()\n",
    "### Export sum calculation:\n",
    "ser_export_sum = ser_export_ann.groupby(['Date', 'Reporter']).sum()\n",
    "ser_export_sum.index.names = ['Date', 'Country']\n",
    "### Openess measure saving:\n",
    "(ser_export_sum / ser_gdp_ann).to_hdf(str_path_gravity_results, str_key_openess, mode = 'a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: WORLD EXPORT ANNUALIZATION\n",
    "\n",
    "### Annualization of different frequency data definition:\n",
    "def annualize(ser_group):\n",
    "    ser_ann = ser_group.droplevel(['Reporter'])\n",
    "    ### Filter not empty pairs:\n",
    "    if (ser_ann.count() > 0):\n",
    "        ### Data frequency calculation (observations per year):\n",
    "        ser_freq = ser_ann.dropna().resample('BY').count()\n",
    "        int_freq = int(ser_freq[ser_freq > 0].median())\n",
    "        ### Data periodicity definition (months number covered per one observation):\n",
    "        int_period = 12 // int_freq\n",
    "        ### Convert data to monthly frequency:\n",
    "        if (int_period > 1):\n",
    "            ### Prepending one more period to backfill first valid observation:\n",
    "            ser_ann = ser_ann.append(pd.Series(np.NaN, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()    \n",
    "            ### Replace period value to monthly value:\n",
    "            ser_ann = (ser_ann / int_period)\n",
    "            ### Backfill monthly value for a whole period:\n",
    "            ser_ann = ser_ann.resample('BM').bfill(limit = int_period - 1)\n",
    "            ### Drop dummy observation:\n",
    "            ser_ann.drop(ser_ann.index[0], inplace = True)\n",
    "        ### Annualize data:\n",
    "        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()\n",
    "    ### Results output:\n",
    "    return ser_ann\n",
    "### World Export Loading:\n",
    "ser_world_export = pd.read_hdf(path_or_buf = str_path_imf_dots_world, key = str_full_imf_dots_world)\n",
    "### World Export annualization:\n",
    "ser_world_export_ann = (ser_world_export.groupby(['Reporter']).apply(annualize).astype('float32')).reorder_levels([1, 0]).sort_index()\n",
    "ser_world_export_ann.index.names = ['Date', 'Reporter']\n",
    "ser_world_export_ann.to_hdf(str_path_world_export_annualized, str_key_world_export_annualized, mode = 'w')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Closing remaining open files:Data_Files/Source_Files/datasets_annualized.h5...done\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: CROSS SECTIONAL ACTIVITY TOTALS & WEIGHTS BY COUNTRY SAVING\n",
    "\n",
    "gc.collect()\n",
    "### Annualized data loading:\n",
    "dict_annual = {}\n",
    "dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')\n",
    "dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')\n",
    "dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')\n",
    "#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')\n",
    "#dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')\n",
    "### X-sectional sum by Country:\n",
    "dict_country_sum = {}\n",
    "for iter_activity in dict_annual:\n",
    "    ### Modified to exlcude partners that don't have returns:\n",
    "    idx_ret_exist = ser_ret_ave.dropna().index\n",
    "    ser_iter_activity = dict_annual[iter_activity]\n",
    "    ser_iter_activity = ser_iter_activity.groupby('Reporter').apply(lambda ser_country: ser_country.droplevel('Reporter').reindex(idx_ret_exist))\\\n",
    "                                         .reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()    \n",
    "    df_iter_sum = ser_iter_activity.groupby(['Date', 'Reporter']).sum().reset_index('Reporter')\n",
    "#    df_iter_sum = dict_annual[iter_activity].groupby(['Date', 'Reporter']).sum().reset_index('Reporter')\n",
    "    df_iter_sum['Reporter'] = df_iter_sum['Reporter'].astype(str)\n",
    "    ser_iter_sum = df_iter_sum.set_index('Reporter', append = True).squeeze().sort_index()\n",
    "    ser_iter_sum.name = 'Volume'\n",
    "    dict_country_sum[iter_activity] = ser_iter_sum\n",
    "#    break\n",
    "### Reporter data aggregation:\n",
    "ser_country_sum = pd.concat(dict_country_sum, axis = 0, sort = False, names = ['Activity'])\n",
    "ser_country_sum.reorder_levels([1, 2, 0]).to_hdf(str_path_gravity_results, key = str_key_activity_sum, mode = 'a')\n",
    "ser_country_share = ser_country_sum.loc[list(dict_country_sum.keys())[:-1]].groupby(['Date', 'Reporter']).apply(lambda ser_group: ser_group / ser_group.sum())\n",
    "ser_country_share.reorder_levels([1, 2, 0]).to_hdf(str_path_gravity_results, key = str_key_activity_share, mode = 'a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data_Files/Source_Files/herfindahl_indices.h5 File removed\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: HERFINDAHL INDEX CALСULATION\n",
    "\n",
    "gc.collect()\n",
    "### Annualized data loading:\n",
    "dict_annual = {}\n",
    "dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')\n",
    "dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')\n",
    "dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')\n",
    "#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')\n",
    "dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')\n",
    "\n",
    "### Defining Herfindahl index calculation:\n",
    "def get_herfindahl(ser_group):\n",
    "    if (ser_group.count() > 0):\n",
    "        ser_norm = ser_group / ser_group.sum()\n",
    "        flo_herfindahl = 1 / ((ser_norm ** 2).sum() ** (1 / 2))\n",
    "    else:\n",
    "        flo_herfindahl = np.NaN\n",
    "    return flo_herfindahl\n",
    "### Deleting existing file with annualized data:\n",
    "if os.path.exists(str_path_herfindahl):\n",
    "    os.remove(str_path_herfindahl)\n",
    "    print(str_path_herfindahl, 'File removed')\n",
    "### Looping over activities datasets:\n",
    "for iter_dataset in dict_annual:\n",
    "#for iter_dataset in ['imf_cpis']:    \n",
    "    ### Herfindahl index calculation:\n",
    "    ser_herfindahl_full = dict_annual[iter_dataset].groupby(['Date', 'Reporter']).apply(get_herfindahl)\n",
    "    ser_herfindahl_full.to_hdf(str_path_herfindahl, iter_dataset, mode = 'a', format = 'table')"
   ]
  },
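  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: HERFINDAHL-BASED DIVERSIFICATION MEASURE\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): made-up volumes; for N\n",
    "### equal partners the measure equals sqrt(N), so higher values mean a more\n",
    "### diversified reporter:\n",
    "print('4 equal partners: ', round(get_herfindahl(pd.Series([5.0, 5.0, 5.0, 5.0])), 4))\n",
    "print('1 dominant partner: ', round(get_herfindahl(pd.Series([20.0, 0.0, 0.0, 0.0])), 4))"
   ]
  },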
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: ADDING HERFINDAHL INDICES TO DATA COLLECTION\n",
    "\n",
    "dict_herfindahl = {}\n",
    "for iter_activity in dict_activity:\n",
    "    ser_iter_herfindahl = pd.read_hdf(str_path_herfindahl, iter_activity)\n",
    "    df_iter_herfindahl = ser_iter_herfindahl.reset_index('Reporter')\n",
    "    df_iter_herfindahl['Country'] = df_iter_herfindahl['Reporter'].astype(str)\n",
    "    dict_herfindahl[iter_activity] = df_iter_herfindahl.drop('Reporter', axis = 1).set_index('Country', append = True).squeeze()\n",
    "ser_full_herfindahl = pd.concat(dict_herfindahl, axis = 0).reorder_levels([1, 2, 0])\n",
    "ser_full_herfindahl.index.names = ['Date', 'Country', 'Activity']\n",
    "ser_full_herfindahl.to_hdf(str_path_gravity_results, str_key_herfindahl, mode = 'a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data_Files/Source_Files/datasets_weights.h5 File removed\n",
      "imf_dots : weights calculation started\n",
      "imf_dots : weights calculation done\n",
      "imf_cpis : weights calculation started\n",
      "imf_cpis : weights calculation done\n",
      "oecd_fdi : weights calculation started\n",
      "oecd_fdi : weights calculation done\n",
      "bis_lbs : weights calculation started\n",
      "bis_lbs : weights calculation done\n",
      "gravity : weights calculation started\n",
      "gravity : weights calculation done\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: REPLACING VALUES WITH WEIGHTS WHILE CONTROLLING HERFINDAL INDEX VALUE\n",
    "\n",
    "gc.collect()\n",
    "### Annualized data loading:\n",
    "dict_annual = {}\n",
    "dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')\n",
    "dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')\n",
    "dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')\n",
    "dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')\n",
    "dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')\n",
    "### Weights container:\n",
    "dict_weights = {}\n",
    "### Defining Herfindahl index calculation:\n",
    "def set_weights(ser_group, flo_limit = 1.0):\n",
    "    if (ser_group.count() > 0):\n",
    "        ser_norm = ser_group / ser_group.sum()\n",
    "        flo_herfindahl = 1 / ((ser_norm ** 2).sum() ** (1 / 2))\n",
    "        if (flo_herfindahl < flo_limit):\n",
    "            ser_weights = pd.Series(np.NaN, index = ser_group.index)\n",
    "        else:\n",
    "            ser_weights = ser_norm\n",
    "    else:\n",
    "        ser_weights = ser_group\n",
    "    return ser_weights\n",
    "### Deleting existing file with weights:\n",
    "if os.path.exists(str_path_act_weights):\n",
    "    os.remove(str_path_act_weights)\n",
    "    print(str_path_act_weights, 'File removed')\n",
    "### Looping over activities datasets:\n",
    "for iter_dataset in dict_annual:\n",
    "#for iter_dataset in ['imf_cpis']:    \n",
    "    ### Weights calculation:\n",
    "    gc.collect()\n",
    "    print(iter_dataset, ': weights calculation started')    \n",
    "    ser_iter_weights = dict_annual[iter_dataset].groupby(['Date', 'Reporter']).apply(set_weights, flo_hi_threshold)\n",
    "    ser_iter_weights.to_hdf(str_path_act_weights, iter_dataset, mode = 'a', format = 'table')\n",
    "    print(iter_dataset, ': weights calculation done')\n",
    "#    break"
   ]
  },
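  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: WEIGHTS WITH HERFINDAHL THRESHOLD\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): a fully concentrated\n",
    "### toy group scores exactly 1.0, so flo_limit = 1.1 masks its weights with NaN\n",
    "### while flo_limit = 0.0 keeps the normalized shares:\n",
    "ser_demo_conc = pd.Series([20.0, 0.0, 0.0, 0.0])\n",
    "print(set_weights(ser_demo_conc, flo_limit = 1.1))\n",
    "print(set_weights(ser_demo_conc, flo_limit = 0.0))"
   ]
  },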
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-03-31 00:00:00 : 1\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION\n",
    "\n",
    "gc.collect()\n",
    "### Weighted returns calculator:\n",
    "def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):\n",
    "    df_group = df_group.dropna()\n",
    "    if (int_select_top is not None):\n",
    "        df_group = df_group.sort_values('Weight', ascending = False)[ : int_select_top]\n",
    "    if (flo_select_share is not None):\n",
    "        df_group = df_group[df_group['Weight'] > flo_select_share]\n",
    "    flo_weighted = np.NaN\n",
    "    if (len(df_group.index) > 0):\n",
    "        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()\n",
    "    return flo_weighted\n",
    "### Global container:\n",
    "dict_all_weighted = {}\n",
    "### Number of datasets active:\n",
    "int_len = -1\n",
    "### Looping over returns dates:\n",
    "#for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):\n",
    "for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:\n",
    "    ### Dates defining:\n",
    "    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)\n",
    "    if (date_bm_end > iter_date):\n",
    "        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)\n",
    "#    print(iter_date, '/', date_bm_end)\n",
    "    ### Daily returns extraction:\n",
    "    ser_iter_ret = ser_ret_ave.loc[iter_date]\n",
    "    ### Daily container:\n",
    "    dict_iter_weighted = {}\n",
    "#    for iter_dataset in dict_dataset:\n",
    "    for iter_dataset in ['imf_dots']:\n",
    "        ### Loading raw dataset:\n",
    "        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]\n",
    "        if (ser_iter_raw.count() > 0):\n",
    "            ### Defining last date by lagging original dataset:\n",
    "            date_last = ser_iter_raw.index.get_level_values('Date')[-1]\n",
    "#            date_last = (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))          \n",
    "            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])\n",
    "    #        print(iter_date, '/', date_bm_end, '/', date_prev, ':', date_last)\n",
    "            ### Perform lagging & forward filling on annualized dataset:\n",
    "            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])\n",
    "            ### Taking last date value:\n",
    "            ser_iter_last = ser_iter_ann[date_last]\n",
    "            ### Calculating of weighted average of returns:\n",
    "            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret)\n",
    "            df_to_weight.columns = ['Weight', 'Ret_USD']\n",
    "            df_to_weight['Weight'] = df_to_weight['Weight'].groupby('Reporter').apply(lambda ser_group: ser_group / ser_group.sum())\n",
    "#            print(df_to_weight.loc['AE'])\n",
    "            ### Simple weighted average:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret)            \n",
    "            ### Weighted average with threshold by partners number:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top)\n",
    "            ### Weighted average with threshold by share:\n",
    "            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)\n",
    "            ### Weighted average with both thresholds:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top, flo_select_share)\n",
    "    if (len(dict_iter_weighted) > 0):\n",
    "        ### Daily results aggregation:\n",
    "        dict_all_weighted[iter_date] = pd.concat(dict_iter_weighted, axis = 1, sort = False)\n",
    "    if (int_len != len(dict_iter_weighted)):\n",
    "        int_len = len(dict_iter_weighted)\n",
    "        print(iter_date, ':', str(int_len))\n",
    "### Global results aggregation:\n",
    "df_all_weighted = pd.concat(dict_all_weighted, axis = 0, sort = False) \n",
    "df_all_weighted.index.names = ['Date', 'Country']\n",
    "### Global results saving:\n",
    "#df_all_weighted.to_hdf(str_path_ret_weighted, str_key_weighted, mode = 'w', format = 'table')"
   ]
  },
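  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### OPTIONAL SANITY CHECK: WEIGHTED RETURNS WITH SELECTION THRESHOLDS\n",
    "\n",
    "### Illustrative sketch only (not part of the pipeline): made-up weights and\n",
    "### returns for a single reporter, comparing the selection thresholds:\n",
    "df_demo = pd.DataFrame({'Weight' : [0.6, 0.3, 0.08, 0.02], 'Ret_USD' : [0.01, 0.02, 0.10, -0.50]})\n",
    "print('no threshold: ', round(get_weighted_ret(df_demo), 4))\n",
    "print('top 2 partners: ', round(get_weighted_ret(df_demo, int_select_top = 2), 4))\n",
    "print('share above 0.05: ', round(get_weighted_ret(df_demo, flo_select_share = 0.05), 4))"
   ]
  },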
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-03-31 00:00:00 : 1\n"
     ]
    }
   ],
   "source": [
    "### RUN EVERY TIME: ALTERNATIVE WEIGHTED AVERAGE OF RETURNS CALCULATION FOR IMF DOTS\n",
    "\n",
    "gc.collect()\n",
    "### Weighted returns calculator:\n",
    "def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):\n",
    "    df_group = df_group.dropna()\n",
    "    if (int_select_top is not None):\n",
    "        df_group = df_group.sort_values('Weight', ascending = False)[ : int_select_top]\n",
    "    if (flo_select_share is not None):\n",
    "        df_group = df_group[df_group['Weight'] > flo_select_share]\n",
    "    flo_weighted = np.NaN\n",
    "    if (len(df_group.index) > 0):\n",
    "        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()\n",
    "    return flo_weighted\n",
    "### Global container:\n",
    "dict_all_weighted = {}\n",
    "### Loading World Export Annualized:\n",
    "ser_world_export_ann = pd.read_hdf(str_path_world_export_annualized, str_key_world_export_annualized)\n",
    "### Number of datasets active:\n",
    "int_len = -1\n",
    "### Looping over returns dates:\n",
    "for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):\n",
    "#for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:\n",
    "    ### Dates defining:\n",
    "    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)\n",
    "    if (date_bm_end > iter_date):\n",
    "        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)\n",
    "#    print(iter_date, '/', date_bm_end)\n",
    "    ### Daily returns extraction:\n",
    "    ser_iter_ret = ser_ret_ave.loc[iter_date]\n",
    "    ### Daily container:\n",
    "    dict_iter_weighted = {}\n",
    "#    for iter_dataset in dict_dataset:\n",
    "    for iter_dataset in ['imf_dots']:\n",
    "        ### Loading raw dataset:\n",
    "        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]\n",
    "        if (ser_iter_raw.count() > 0):\n",
    "            ### Defining last date by lagging original dataset:\n",
    "            date_last = ser_iter_raw.index.get_level_values('Date')[-1]\n",
    "#            date_last = (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))          \n",
    "            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])\n",
    "    #        print(iter_date, '/', date_bm_end, '/', date_prev, ':', date_last)\n",
    "            ### Perform lagging & forward filling on annualized dataset:\n",
    "            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])\n",
    "            ser_iter_world_ann = ser_world_export_ann.loc[date_prev : date_last].groupby(['Reporter']).ffill(limit = dict_ffill[iter_dataset])\n",
    "            ### Taking last date value:\n",
    "            ser_iter_last = ser_iter_ann[date_last]\n",
    "            ser_iter_world_last = ser_iter_world_ann[date_last]\n",
    "            ### Calculating of weighted average of returns:\n",
    "            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret).join(ser_iter_world_last)\n",
    "            df_to_weight.columns = ['Country_Export', 'Ret_USD', 'World_Export']\n",
    "            df_to_weight['Weight'] = df_to_weight.groupby('Reporter', group_keys = False).apply(lambda df_group: df_group['Country_Export'] / df_group['World_Export'])\n",
    "#            print(df_to_weight.loc['US'])\n",
    "            ### Simple weighted average:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret)            \n",
    "            ### Weighted average with threshold by partners number:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top)\n",
    "            ### Weighted average with threshold by share:\n",
    "            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)\n",
    "            ### Weighted average with both thresholds:\n",
    "#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top, flo_select_share)\n",
    "    if (len(dict_iter_weighted) > 0):\n",
    "        ### Daily results aggregation:\n",
    "        dict_all_weighted[iter_date] = pd.concat(dict_iter_weighted, axis = 1, sort = False)\n",
    "    if (int_len != len(dict_iter_weighted)):\n",
    "        int_len = len(dict_iter_weighted)\n",
    "        print(iter_date, ':', str(int_len))\n",
    "### Global results aggregation:\n",
    "ser_all_weighted = pd.concat(dict_all_weighted, axis = 0, sort = False).squeeze()\n",
    "ser_all_weighted.index.names = ['Date', 'Country']\n",
    "### Global results saving:\n",
    "df_all_weighted['imf_dots'] = ser_all_weighted\n",
    "df_all_weighted.to_hdf(str_path_ret_weighted, str_key_weighted, mode = 'w', format = 'table')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: ADDING WEIGHTED RETURNS TO DATA COLLECTION\n",
    "\n",
    "df_all_weighted = pd.read_hdf(str_path_ret_weighted, str_key_weighted).reset_index('Country')\n",
    "df_all_weighted['Country'] = df_all_weighted['Country'].astype(str)\n",
    "ser_weighted = df_all_weighted.set_index('Country', append = True).stack()\n",
    "ser_weighted.index.names = ['Date', 'Country', 'Activity']\n",
    "ser_weighted.to_hdf(str_path_gravity_results, str_key_ret_weighted, mode = 'a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "### RUN EVERY TIME: HDF TO CSV\n",
    "\n",
    "ser_activity_sum = pd.read_hdf(str_path_gravity_results, str_key_activity_sum)\n",
    "ser_activity_sum.fillna(0.0).to_csv(str_activity_sum_csv_path, sep = ';', header = False)\n",
    "ser_activity_share = pd.read_hdf(str_path_gravity_results, str_key_activity_share)\n",
    "ser_activity_share.replace(0.0, np.NaN).to_csv(str_activity_share_csv_path, sep = ';', header = False)\n",
    "ser_gdp_total = pd.read_hdf(str_path_gravity_results, str_key_gdp_total)\n",
    "ser_gdp_total.to_csv(str_gdp_total_csv_path, sep = ';', header = False)\n",
    "ser_herfindahl = pd.read_hdf(str_path_gravity_results, str_key_herfindahl)\n",
    "ser_herfindahl.to_csv(str_herfindahl_csv_path, sep = ';', header = False)\n",
    "ser_openess = pd.read_hdf(str_path_gravity_results, str_key_openess)\n",
    "ser_openess.to_csv(str_openess_csv_path, sep = ';', header = False)\n",
    "ser_ret_weighted = pd.read_hdf(str_path_gravity_results, str_key_ret_weighted)\n",
    "ser_ret_weighted.to_csv(str_ret_weighted_csv_path, sep = ';', header = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### TEMP\n",
    "\n",
    "pd.read_csv(str_ret_weighted_csv_path, sep = ';', header = None, parse_dates = [0]).dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "### TEMP\n",
    "\n",
    "ser_ret_weighted = pd.read_hdf(str_path_gravity_results, str_key_ret_weighted)\n",
    "ser_ret_weighted.to_csv(str_ret_weighted_csv_path, sep = ';', header = False)\n",
    "#ser_ret_weighted['2000-12-29', 'US']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### TEMP\n",
    "\n",
    "### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION\n",
    "\n",
    "gc.collect()\n",
    "### Weighted returns calculator:\n",
    "def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):\n",
    "    df_group = df_group.dropna()\n",
    "    if (int_select_top is not None):\n",
    "        df_group = df_group.sort_values('Weight', ascending = False)[ : int_select_top]\n",
    "    if (flo_select_share is not None):\n",
    "        df_group = df_group[df_group['Weight'] > flo_select_share]\n",
    "    flo_weighted = np.NaN\n",
    "    if (len(df_group.index) > 0):\n",
    "        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()\n",
    "    return flo_weighted\n",
    "### Global container:\n",
    "dict_all_weighted = {}\n",
    "### Number of datasets active:\n",
    "int_len = -1\n",
    "### Looping over returns dates:\n",
    "#for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):\n",
    "#for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:\n",
    "for iter_date in [pd.to_datetime('2020-03-18')]:\n",
    "    ### Dates defining:\n",
    "    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)\n",
    "    if (date_bm_end > iter_date):\n",
    "        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)\n",
    "    print(iter_date, '/', date_bm_end)\n",
    "    ### Daily returns extraction:\n",
    "    ser_iter_ret = ser_ret_ave.loc[iter_date]\n",
    "    ### Daily container:\n",
    "    dict_iter_weighted = {}\n",
    "#    for iter_dataset in dict_dataset:\n",
    "    for iter_dataset in ['imf_cpis']:\n",
    "        ### Loading raw dataset:\n",
    "        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]\n",
    "        if (ser_iter_raw.count() > 0):\n",
    "            ### Defining last date by lagging original dataset:\n",
    "            date_last = ser_iter_raw.index.get_level_values('Date')[-1]\n",
    "            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])\n",
    "            print(iter_date, '/', date_bm_end, '/', date_prev, ':', date_last)\n",
    "            ### Perform lagging & forward filling on annualized dataset:\n",
    "            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])\n",
    "            ### Taking last date value:\n",
    "            ser_iter_last = ser_iter_ann[date_last]\n",
    "            ### Calculating of weighted average of returns:\n",
    "            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret)\n",
    "            df_to_weight.columns = ['Weight', 'Ret_USD']\n",
    "            df_to_weight['Weight'] = df_to_weight['Weight'].groupby('Reporter').apply(lambda ser_group: ser_group / ser_group.sum())\n",
    "            ### Weighted average with threshold by share:\n",
    "            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### TEMP\n",
    "\n",
    "sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
