[llvm] 377f27b - [X86] `DAGTypeLegalizer::ModifyToType()`: when widening w/ zeros, insert into undef and `and`-mask the padding away
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 24 10:27:30 PDT 2022
Author: Roman Lebedev
Date: 2022-10-24T20:27:02+03:00
New Revision: 377f27be87f92e82902be277bb02c401475c9aa1
URL: https://github.com/llvm/llvm-project/commit/377f27be87f92e82902be277bb02c401475c9aa1
DIFF: https://github.com/llvm/llvm-project/commit/377f27be87f92e82902be277bb02c401475c9aa1.diff
LOG: [X86] `DAGTypeLegalizer::ModifyToType()`: when widening w/ zeros, insert into undef and `and`-mask the padding away
We can expect that a sequence of inserts-of-extracts-into-undef will be
successfully lowered back into a widening of the source vector, but it
seems that, at least for X86 mask vectors, we have a really hard time
recovering from inserting-into-zero.

I've looked into alternative fix injection points, and they are much more
involved: by the time of `LowerBUILD_VECTORvXi1()`/`LowerINSERT_VECTOR_ELT()`
the constants might already be obscured, so it does not seem like we can
easily deal with this by lowering into bit math later on; some other
pieces are missing.

Instead, simply clearing the padding away via an `AND` mask is at least
not a worse choice. Why create a problem where there wasn't one?
That said, it is possible that in some cases the constants originate from
the source IR, so some other fix may still be needed.
Reviewed By: pengfei
Differential Revision: https://reviews.llvm.org/D136046
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/test/CodeGen/X86/masked_gather_scatter.ll
llvm/test/CodeGen/X86/masked_store.ll
llvm/test/CodeGen/X86/pr45563-2.ll
llvm/test/CodeGen/X86/pr45833.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 89e9659137c9..4db646c09bbd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7036,7 +7036,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
unsigned InNumElts = InEC.getFixedValue();
unsigned WidenNumElts = WidenEC.getFixedValue();
- // Fall back to extract and build.
+ // Fall back to extract and build (+ mask, if padding with zeros).
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = NVT.getVectorElementType();
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
@@ -7045,9 +7045,21 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(Idx, dl));
- SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
- DAG.getUNDEF(EltVT);
- for ( ; Idx < WidenNumElts; ++Idx)
- Ops[Idx] = FillVal;
- return DAG.getBuildVector(NVT, dl, Ops);
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+
+ SDValue Widened = DAG.getBuildVector(NVT, dl, Ops);
+ if (!FillWithZeroes)
+ return Widened;
+
+ assert(NVT.isInteger() &&
+ "We expect to never want to FillWithZeroes for non-integral types.");
+
+ SmallVector<SDValue, 16> MaskOps;
+ MaskOps.append(MinNumElts, DAG.getAllOnesConstant(dl, EltVT));
+ MaskOps.append(WidenNumElts - MinNumElts, DAG.getConstant(0, dl, EltVT));
+
+ return DAG.getNode(ISD::AND, dl, NVT, Widened,
+ DAG.getBuildVector(NVT, dl, MaskOps));
}
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 3d714f31ba72..f5e520e9b48b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2488,11 +2488,8 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_64-LABEL: test30:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
-; KNL_64-NEXT: movw $-3, %ax
-; KNL_64-NEXT: kmovw %eax, %k0
; KNL_64-NEXT: andl $1, %edi
-; KNL_64-NEXT: kmovw %edi, %k1
-; KNL_64-NEXT: kandw %k0, %k1, %k0
+; KNL_64-NEXT: kmovw %edi, %k0
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kshiftlw $15, %k1, %k1
; KNL_64-NEXT: kshiftrw $14, %k1, %k1
@@ -2504,6 +2501,9 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_64-NEXT: kshiftlw $15, %k1, %k1
; KNL_64-NEXT: kshiftrw $13, %k1, %k1
; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: movb $7, %al
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kandw %k1, %k0, %k0
; KNL_64-NEXT: kshiftlw $12, %k0, %k0
; KNL_64-NEXT: kshiftrw $12, %k0, %k1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
@@ -2517,12 +2517,9 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_32-LABEL: test30:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_32-NEXT: movw $-3, %ax
-; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: kandw %k0, %k1, %k0
+; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: kshiftlw $15, %k1, %k1
@@ -2536,6 +2533,9 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_32-NEXT: kshiftlw $15, %k1, %k1
; KNL_32-NEXT: kshiftrw $13, %k1, %k1
; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: movb $7, %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k1, %k0, %k0
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
@@ -2547,23 +2547,23 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
;
; SKX-LABEL: test30:
; SKX: # %bb.0:
-; SKX-NEXT: movb $-3, %al
-; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kmovw %esi, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k0
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
-; SKX-NEXT: kandw %k0, %k1, %k0
-; SKX-NEXT: kmovw %esi, %k1
-; SKX-NEXT: kshiftlb $7, %k1, %k1
-; SKX-NEXT: kshiftrb $6, %k1, %k1
-; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
; SKX-NEXT: movb $-5, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kmovw %edx, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
-; SKX-NEXT: korw %k1, %k0, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kandw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
@@ -2574,18 +2574,15 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
;
; SKX_32-LABEL: test30:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movb $-3, %al
+; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: kshiftlb $7, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k0
; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $7, %k1, %k1
-; SKX_32-NEXT: kandw %k0, %k1, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
-; SKX_32-NEXT: kshiftlb $7, %k1, %k1
-; SKX_32-NEXT: kshiftrb $6, %k1, %k1
-; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: korw %k0, %k1, %k0
; SKX_32-NEXT: movb $-5, %al
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k0
@@ -2593,7 +2590,10 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $5, %k1, %k1
-; SKX_32-NEXT: korw %k1, %k0, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: movb $7, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kandw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1}
@@ -2612,11 +2612,8 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_64-LABEL: test30b:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
-; KNL_64-NEXT: movw $-3, %ax
-; KNL_64-NEXT: kmovw %eax, %k0
; KNL_64-NEXT: andl $1, %edi
-; KNL_64-NEXT: kmovw %edi, %k1
-; KNL_64-NEXT: kandw %k0, %k1, %k0
+; KNL_64-NEXT: kmovw %edi, %k0
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kshiftlw $15, %k1, %k1
; KNL_64-NEXT: kshiftrw $14, %k1, %k1
@@ -2628,6 +2625,9 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_64-NEXT: kshiftlw $15, %k1, %k1
; KNL_64-NEXT: kshiftrw $13, %k1, %k1
; KNL_64-NEXT: korw %k1, %k0, %k0
+; KNL_64-NEXT: movb $7, %al
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kandw %k1, %k0, %k0
; KNL_64-NEXT: kshiftlw $12, %k0, %k0
; KNL_64-NEXT: kshiftrw $12, %k0, %k1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
@@ -2640,12 +2640,9 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_32-LABEL: test30b:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_32-NEXT: movw $-3, %ax
-; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: kandw %k0, %k1, %k0
+; KNL_32-NEXT: kmovw %eax, %k0
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: kshiftlw $15, %k1, %k1
@@ -2659,6 +2656,9 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_32-NEXT: kshiftlw $15, %k1, %k1
; KNL_32-NEXT: kshiftrw $13, %k1, %k1
; KNL_32-NEXT: korw %k1, %k0, %k0
+; KNL_32-NEXT: movb $7, %al
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: kandw %k1, %k0, %k0
; KNL_32-NEXT: kshiftlw $12, %k0, %k0
; KNL_32-NEXT: kshiftrw $12, %k0, %k1
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
@@ -2669,23 +2669,23 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
;
; SKX-LABEL: test30b:
; SKX: # %bb.0:
-; SKX-NEXT: movb $-3, %al
-; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kmovw %esi, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k0
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
-; SKX-NEXT: kandw %k0, %k1, %k0
-; SKX-NEXT: kmovw %esi, %k1
-; SKX-NEXT: kshiftlb $7, %k1, %k1
-; SKX-NEXT: kshiftrb $6, %k1, %k1
-; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
; SKX-NEXT: movb $-5, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kmovw %edx, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
-; SKX-NEXT: korw %k1, %k0, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kandw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
@@ -2695,18 +2695,15 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
;
; SKX_32-LABEL: test30b:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movb $-3, %al
+; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: kshiftlb $7, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k0
; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $7, %k1, %k1
-; SKX_32-NEXT: kandw %k0, %k1, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
-; SKX_32-NEXT: kshiftlb $7, %k1, %k1
-; SKX_32-NEXT: kshiftrb $6, %k1, %k1
-; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: korw %k0, %k1, %k0
; SKX_32-NEXT: movb $-5, %al
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k0
@@ -2714,7 +2711,10 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $5, %k1, %k1
-; SKX_32-NEXT: korw %k1, %k0, %k1
+; SKX_32-NEXT: korw %k1, %k0, %k0
+; SKX_32-NEXT: movb $7, %al
+; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kandw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1}
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c6789dec3530..5bf48ce1bfc8 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -5324,11 +5324,8 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
; AVX512F-LABEL: widen_masked_store:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: movw $-3, %ax
-; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: andl $1, %esi
-; AVX512F-NEXT: kmovw %esi, %k1
-; AVX512F-NEXT: kandw %k0, %k1, %k0
+; AVX512F-NEXT: kmovw %esi, %k0
; AVX512F-NEXT: kmovw %edx, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
@@ -5340,6 +5337,9 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $13, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
+; AVX512F-NEXT: movb $7, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
@@ -5348,33 +5348,30 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
;
; AVX512VLDQ-LABEL: widen_masked_store:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movb $-3, %al
-; AVX512VLDQ-NEXT: kmovw %eax, %k0
+; AVX512VLDQ-NEXT: kmovw %edx, %k0
+; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %esi, %k1
; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1
-; AVX512VLDQ-NEXT: kandw %k0, %k1, %k0
-; AVX512VLDQ-NEXT: kmovw %edx, %k1
-; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
-; AVX512VLDQ-NEXT: kshiftrb $6, %k1, %k1
-; AVX512VLDQ-NEXT: korw %k1, %k0, %k0
+; AVX512VLDQ-NEXT: korw %k0, %k1, %k0
; AVX512VLDQ-NEXT: movb $-5, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: kandw %k1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %ecx, %k1
; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1
-; AVX512VLDQ-NEXT: korw %k1, %k0, %k1
+; AVX512VLDQ-NEXT: korw %k1, %k0, %k0
+; AVX512VLDQ-NEXT: movb $7, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: kandw %k1, %k0, %k1
; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: widen_masked_store:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movw $-3, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k0
; AVX512VLBW-NEXT: andl $1, %esi
-; AVX512VLBW-NEXT: kmovw %esi, %k1
-; AVX512VLBW-NEXT: kandw %k0, %k1, %k0
+; AVX512VLBW-NEXT: kmovw %esi, %k0
; AVX512VLBW-NEXT: kmovd %edx, %k1
; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1
@@ -5385,29 +5382,32 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
; AVX512VLBW-NEXT: kmovd %ecx, %k1
; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1
-; AVX512VLBW-NEXT: korw %k1, %k0, %k1
+; AVX512VLBW-NEXT: korw %k1, %k0, %k0
+; AVX512VLBW-NEXT: movb $7, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: kandw %k1, %k0, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: widen_masked_store:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: movb $-3, %al
-; X86-AVX512-NEXT: kmovd %eax, %k0
+; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k0
+; X86-AVX512-NEXT: kshiftlb $7, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k0
; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
; X86-AVX512-NEXT: kshiftrb $7, %k1, %k1
-; X86-AVX512-NEXT: kandw %k0, %k1, %k0
-; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
-; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
-; X86-AVX512-NEXT: kshiftrb $6, %k1, %k1
-; X86-AVX512-NEXT: korw %k1, %k0, %k0
+; X86-AVX512-NEXT: korw %k0, %k1, %k0
; X86-AVX512-NEXT: movb $-5, %al
; X86-AVX512-NEXT: kmovd %eax, %k1
; X86-AVX512-NEXT: kandw %k1, %k0, %k0
; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
; X86-AVX512-NEXT: kshiftrb $5, %k1, %k1
-; X86-AVX512-NEXT: korw %k1, %k0, %k1
+; X86-AVX512-NEXT: korw %k1, %k0, %k0
+; X86-AVX512-NEXT: movb $7, %al
+; X86-AVX512-NEXT: kmovd %eax, %k1
+; X86-AVX512-NEXT: kandw %k1, %k0, %k1
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vmovdqa32 %xmm0, (%eax) {%k1}
; X86-AVX512-NEXT: retl
@@ -6214,17 +6214,14 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k0
-; AVX512F-NEXT: movw $85, %ax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: movw $21845, %ax ## imm = 0x5555
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1}
+; AVX512F-NEXT: movw $85, %ax
; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vpcmpgtd (%rdi), %zmm2, %k2 {%k2}
-; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdx) {%k2}
-; AVX512F-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1}
+; AVX512F-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2}
+; AVX512F-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2}
+; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -6233,338 +6230,49 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; AVX512VLDQ-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512VLDQ-NEXT: vmovdqa64 64(%rsi), %zmm1
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k0
-; AVX512VLDQ-NEXT: movw $85, %ax
-; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: kandb %k1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovb %k0, %k1
; AVX512VLDQ-NEXT: movw $21845, %ax ## imm = 0x5555
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1}
+; AVX512VLDQ-NEXT: movw $85, %ax
; AVX512VLDQ-NEXT: kmovw %eax, %k2
-; AVX512VLDQ-NEXT: vpcmpgtd (%rdi), %zmm2, %k2 {%k2}
-; AVX512VLDQ-NEXT: vmovdqu32 %zmm0, (%rdx) {%k2}
-; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1}
+; AVX512VLDQ-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2}
+; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2}
+; AVX512VLDQ-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; AVX512VLBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512VLBW-NEXT: vpcmpgtd 64(%rdi), %zmm0, %k0
-; AVX512VLBW-NEXT: kunpckwd %k1, %k0, %k0
-; AVX512VLBW-NEXT: movl $5592405, %eax ## imm = 0x555555
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandd %k2, %k0, %k0
-; AVX512VLBW-NEXT: kshiftrd $21, %k0, %k6
-; AVX512VLBW-NEXT: kshiftrd $20, %k0, %k5
-; AVX512VLBW-NEXT: kshiftrd $19, %k0, %k4
-; AVX512VLBW-NEXT: kshiftrd $18, %k0, %k3
-; AVX512VLBW-NEXT: kshiftrd $16, %k0, %k2
-; AVX512VLBW-NEXT: kshiftrd $17, %k0, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k7, %k7
-; AVX512VLBW-NEXT: kshiftrw $14, %k7, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $15, %k2, %k2
-; AVX512VLBW-NEXT: korw %k7, %k2, %k7
-; AVX512VLBW-NEXT: movw $-5, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; AVX512VLBW-NEXT: kandw %k1, %k7, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512VLBW-NEXT: kshiftrw $13, %k3, %k3
-; AVX512VLBW-NEXT: korw %k3, %k7, %k7
-; AVX512VLBW-NEXT: movw $-9, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k3
-; AVX512VLBW-NEXT: kandw %k3, %k7, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k4, %k4
-; AVX512VLBW-NEXT: kshiftrw $12, %k4, %k4
-; AVX512VLBW-NEXT: korw %k4, %k7, %k7
-; AVX512VLBW-NEXT: movw $-17, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k4
-; AVX512VLBW-NEXT: kandw %k4, %k7, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512VLBW-NEXT: kshiftrw $11, %k5, %k5
-; AVX512VLBW-NEXT: korw %k5, %k7, %k7
-; AVX512VLBW-NEXT: movw $-33, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k5
-; AVX512VLBW-NEXT: kandw %k5, %k7, %k7
-; AVX512VLBW-NEXT: kshiftlw $15, %k6, %k6
-; AVX512VLBW-NEXT: kshiftrw $10, %k6, %k6
-; AVX512VLBW-NEXT: korw %k6, %k7, %k7
-; AVX512VLBW-NEXT: movw $-65, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k6
-; AVX512VLBW-NEXT: kandw %k6, %k7, %k7
-; AVX512VLBW-NEXT: kshiftrd $22, %k0, %k1
-; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrw $9, %k1, %k1
-; AVX512VLBW-NEXT: korw %k1, %k7, %k1
-; AVX512VLBW-NEXT: movw $-129, %ax
-; AVX512VLBW-NEXT: kmovd %eax, %k7
-; AVX512VLBW-NEXT: kandw %k7, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $23, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $8, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
; AVX512VLBW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512VLBW-NEXT: vmovdqa64 64(%rsi), %zmm1
-; AVX512VLBW-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1}
-; AVX512VLBW-NEXT: kshiftrd $1, %k0, %k1
-; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1
-; AVX512VLBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $15, %k2, %k2
-; AVX512VLBW-NEXT: korw %k1, %k2, %k1
-; AVX512VLBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $2, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $13, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kandw %k3, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $3, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $12, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kandw %k4, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $4, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $11, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kandw %k5, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $5, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $10, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kandw %k6, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $6, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $9, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kandw %k7, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $7, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $8, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-257, %ax ## imm = 0xFEFF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $8, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $7, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-513, %ax ## imm = 0xFDFF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $9, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $6, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-1025, %ax ## imm = 0xFBFF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $10, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $5, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $11, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $4, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-4097, %ax ## imm = 0xEFFF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $12, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $3, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-8193, %ax ## imm = 0xDFFF
-; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $13, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512VLBW-NEXT: kshiftrw $2, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: movw $-16385, %ax ## imm = 0xBFFF
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: movw $21845, %ax ## imm = 0x5555
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1}
+; AVX512VLBW-NEXT: movw $85, %ax
; AVX512VLBW-NEXT: kmovd %eax, %k2
-; AVX512VLBW-NEXT: kandw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $14, %k0, %k2
-; AVX512VLBW-NEXT: kshiftlw $14, %k2, %k2
-; AVX512VLBW-NEXT: korw %k2, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrd $15, %k0, %k0
-; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512VLBW-NEXT: korw %k0, %k1, %k1
+; AVX512VLBW-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2}
+; AVX512VLBW-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2}
; AVX512VLBW-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: pushl %eax
-; X86-AVX512-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X86-AVX512-NEXT: vpcmpgtd (%eax), %zmm0, %k1
-; X86-AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
-; X86-AVX512-NEXT: vpcmpgtd 64(%eax), %zmm0, %k0
-; X86-AVX512-NEXT: kunpckwd %k1, %k0, %k0
-; X86-AVX512-NEXT: movl $5592405, %eax ## imm = 0x555555
-; X86-AVX512-NEXT: kmovd %eax, %k2
-; X86-AVX512-NEXT: kandd %k2, %k0, %k0
-; X86-AVX512-NEXT: kshiftrd $21, %k0, %k6
-; X86-AVX512-NEXT: kshiftrd $20, %k0, %k5
-; X86-AVX512-NEXT: kshiftrd $19, %k0, %k4
-; X86-AVX512-NEXT: kshiftrd $18, %k0, %k3
-; X86-AVX512-NEXT: kshiftrd $16, %k0, %k2
-; X86-AVX512-NEXT: kshiftrd $17, %k0, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k7, %k7
-; X86-AVX512-NEXT: kshiftrw $14, %k7, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $15, %k2, %k2
-; X86-AVX512-NEXT: korw %k7, %k2, %k7
-; X86-AVX512-NEXT: movw $-5, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k1
-; X86-AVX512-NEXT: kmovw %k1, (%esp) ## 2-byte Spill
-; X86-AVX512-NEXT: kandw %k1, %k7, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k3, %k3
-; X86-AVX512-NEXT: kshiftrw $13, %k3, %k3
-; X86-AVX512-NEXT: korw %k3, %k7, %k7
-; X86-AVX512-NEXT: movw $-9, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k3
-; X86-AVX512-NEXT: kandw %k3, %k7, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k4, %k4
-; X86-AVX512-NEXT: kshiftrw $12, %k4, %k4
-; X86-AVX512-NEXT: korw %k4, %k7, %k7
-; X86-AVX512-NEXT: movw $-17, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k4
-; X86-AVX512-NEXT: kandw %k4, %k7, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k5, %k5
-; X86-AVX512-NEXT: kshiftrw $11, %k5, %k5
-; X86-AVX512-NEXT: korw %k5, %k7, %k7
-; X86-AVX512-NEXT: movw $-33, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k5
-; X86-AVX512-NEXT: kandw %k5, %k7, %k7
-; X86-AVX512-NEXT: kshiftlw $15, %k6, %k6
-; X86-AVX512-NEXT: kshiftrw $10, %k6, %k6
-; X86-AVX512-NEXT: korw %k6, %k7, %k7
-; X86-AVX512-NEXT: movw $-65, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k6
-; X86-AVX512-NEXT: kandw %k6, %k7, %k7
-; X86-AVX512-NEXT: kshiftrd $22, %k0, %k1
-; X86-AVX512-NEXT: kshiftlw $15, %k1, %k1
-; X86-AVX512-NEXT: kshiftrw $9, %k1, %k1
-; X86-AVX512-NEXT: korw %k1, %k7, %k1
-; X86-AVX512-NEXT: movw $-129, %ax
-; X86-AVX512-NEXT: kmovd %eax, %k7
-; X86-AVX512-NEXT: kandw %k7, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $23, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $8, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vmovdqa64 (%ecx), %zmm0
-; X86-AVX512-NEXT: vmovdqa64 64(%ecx), %zmm1
-; X86-AVX512-NEXT: vmovdqu32 %zmm1, 64(%eax) {%k1}
-; X86-AVX512-NEXT: kshiftrd $1, %k0, %k1
-; X86-AVX512-NEXT: kshiftlw $15, %k1, %k1
-; X86-AVX512-NEXT: kshiftrw $14, %k1, %k1
-; X86-AVX512-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $15, %k2, %k2
-; X86-AVX512-NEXT: korw %k1, %k2, %k1
-; X86-AVX512-NEXT: kmovw (%esp), %k2 ## 2-byte Reload
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $2, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $13, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kandw %k3, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $3, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $12, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kandw %k4, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $4, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $11, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kandw %k5, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $5, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $10, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kandw %k6, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $6, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $9, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kandw %k7, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $7, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $8, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-257, %cx ## imm = 0xFEFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $8, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $7, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-513, %cx ## imm = 0xFDFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $9, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $6, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-1025, %cx ## imm = 0xFBFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $10, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $5, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-2049, %cx ## imm = 0xF7FF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $11, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $4, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-4097, %cx ## imm = 0xEFFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $12, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $3, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-8193, %cx ## imm = 0xDFFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $13, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2
-; X86-AVX512-NEXT: kshiftrw $2, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: movw $-16385, %cx ## imm = 0xBFFF
-; X86-AVX512-NEXT: kmovd %ecx, %k2
-; X86-AVX512-NEXT: kandw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $14, %k0, %k2
-; X86-AVX512-NEXT: kshiftlw $14, %k2, %k2
-; X86-AVX512-NEXT: korw %k2, %k1, %k1
-; X86-AVX512-NEXT: kshiftrd $15, %k0, %k0
-; X86-AVX512-NEXT: kshiftlw $1, %k1, %k1
-; X86-AVX512-NEXT: kshiftrw $1, %k1, %k1
-; X86-AVX512-NEXT: kshiftlw $15, %k0, %k0
-; X86-AVX512-NEXT: korw %k0, %k1, %k1
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX512-NEXT: vmovdqa64 (%edx), %zmm0
+; X86-AVX512-NEXT: vmovdqa64 64(%edx), %zmm1
+; X86-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX512-NEXT: movw $21845, %dx ## imm = 0x5555
+; X86-AVX512-NEXT: kmovd %edx, %k1
+; X86-AVX512-NEXT: vpcmpgtd (%ecx), %zmm2, %k1 {%k1}
+; X86-AVX512-NEXT: movw $85, %dx
+; X86-AVX512-NEXT: kmovd %edx, %k2
+; X86-AVX512-NEXT: vpcmpgtd 64(%ecx), %zmm2, %k2 {%k2}
+; X86-AVX512-NEXT: vmovdqu32 %zmm1, 64(%eax) {%k2}
; X86-AVX512-NEXT: vmovdqu32 %zmm0, (%eax) {%k1}
-; X86-AVX512-NEXT: popl %eax
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%trigger = load <24 x i32>, ptr %trigger.ptr
diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll
index f1a409a29db3..7f5adefc02ae 100644
--- a/llvm/test/CodeGen/X86/pr45563-2.ll
+++ b/llvm/test/CodeGen/X86/pr45563-2.ll
@@ -20,33 +20,31 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) {
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: vmovd %esi, %xmm2
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm2, %xmm2
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm2
+; CHECK-NEXT: vmovd %esi, %xmm1
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3
+; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; CHECK-NEXT: vmaskmovps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm2
-; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
-; CHECK-NEXT: vmovss %xmm1, 32(%rax)
-; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2
+; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u]
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vmovss %xmm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <9 x float> @llvm.masked.load.v9f32.p0(ptr %addr, i32 4, <9 x i1>%mask, <9 x float> %dst)
@@ -63,53 +61,47 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],mem[0]
-; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: vmovd %esi, %xmm3
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm3, %xmm3
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4
-; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm5
-; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm5, %ymm5
-; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm1, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, 32(%rax)
-; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm1
-; CHECK-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovss %xmm0, 48(%rax)
-; CHECK-NEXT: vmovaps %ymm2, (%rax)
+; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; CHECK-NEXT: vmovd %esi, %xmm1
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3
+; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4
+; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2
+; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
+; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm5, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, 32(%rdi)
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0
+; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm6, %xmm0
+; CHECK-NEXT: vmovss %xmm0, 48(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <13 x float> @llvm.masked.load.v13f32.p0(ptr %addr, i32 4, <13 x i1>%mask, <13 x float> %dst)
@@ -126,56 +118,49 @@ define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: vmovd %esi, %xmm3
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm3, %xmm3
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4
-; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm5
-; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm5, %ymm5
-; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm6
-; CHECK-NEXT: vblendvps %xmm3, %xmm6, %xmm1, %xmm1
-; CHECK-NEXT: vmovlps %xmm1, 48(%rax)
-; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vmovaps %ymm2, (%rax)
+; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; CHECK-NEXT: vmovd %esi, %xmm1
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3
+; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3]
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2
+; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
+; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0
+; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovlps %xmm0, 48(%rdi)
+; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <14 x float> @llvm.masked.load.v14f32.p0(ptr %addr, i32 4, <14 x i1>%mask, <14 x float> %dst)
diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll
index baca0c05f267..d32fdfb9bb60 100644
--- a/llvm/test/CodeGen/X86/pr45833.ll
+++ b/llvm/test/CodeGen/X86/pr45833.ll
@@ -20,26 +20,24 @@ define void @mstore_split9(<9 x float> %value, ptr %addr, <9 x i1> %mask) {
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi)
-; CHECK-NEXT: vmovd %esi, %xmm1
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vmovd %esi, %xmm2
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4
+; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u]
+; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi)
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -63,39 +61,33 @@ define void @mstore_split13(<13 x float> %value, ptr %addr, <13 x i1> %mask) {
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vmovd %esi, %xmm2
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4
+; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm5
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi)
-; CHECK-NEXT: vmovd %esi, %xmm1
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi)
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v13f32.p0(<13 x float> %value, ptr %addr, i32 4, <13 x i1>%mask)
@@ -119,41 +111,34 @@ define void @mstore_split14(<14 x float> %value, ptr %addr, <14 x i1> %mask) {
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vmovd %esi, %xmm2
+; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4
+; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi)
-; CHECK-NEXT: vmovd %esi, %xmm1
-; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi)
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm4[u],zero,xmm4[u]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v14f32.p0(<14 x float> %value, ptr %addr, i32 4, <14 x i1>%mask)
@@ -228,15 +213,15 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) {
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
@@ -246,7 +231,6 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) {
; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3
@@ -261,33 +245,35 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) {
; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 32(%rdi)
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vmovd %esi, %xmm2
+; CHECK-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $6, %r8d, %xmm2, %xmm2
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, %r9d, %xmm2, %xmm2
; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 64(%rdi)
-; CHECK-NEXT: vmovd %esi, %xmm1
-; CHECK-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, (%rdi)
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
-; CHECK-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
-; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, 64(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v23f32.p0(<23 x float> %value, ptr %addr, i32 4, <23 x i1>%mask)