[llvm] DAG: Avoid forming shufflevector from a single extract_vector_elt (PR #122672)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 16 17:42:04 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122672
From f7485059c6ff1623f2195f509932628cfc3c07ad Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 10 Jan 2025 21:13:09 +0700
Subject: [PATCH 1/5] DAG: Avoid forming shufflevector from a single
extract_vector_elt
This avoids regressions in a future AMDGPU commit. Previously, a
build_vector (extract_vector_elt x), undef on a target with free
access to the vector elements was bloated into a shuffle of one
element plus undef, which has much worse combine support than the
extract.

Alternatively, this could check aggressivelyPreferBuildVectorSources,
but I'm not sure it's really different from isExtractVecEltCheap.
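For illustration only (a hypothetical example, not taken from this patch or
its tests), a minimal IR sketch of the pattern in question, a single
constant-index extract feeding a build_vector, is:

  define <4 x float> @single_extract(<4 x float> %v) {
    ; at the SelectionDAG level this becomes
    ; build_vector (extract_vector_elt %v, 2), undef, undef, undef
    %elt = extractelement <4 x float> %v, i32 2
    %bv = insertelement <4 x float> undef, float %elt, i32 0
    ret <4 x float> %bv
  }

Before this change, reduceBuildVecToShuffle would rewrite that build_vector
into a one-input vector_shuffle; with the new check it is left as the plain
extract when the target reports the extract as cheap.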
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +++++++++++++++----
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 ++++----
llvm/test/CodeGen/X86/avx512-build-vector.ll | 8 +++---
.../CodeGen/X86/insertelement-duplicates.ll | 10 +++-----
llvm/test/CodeGen/X86/sse-align-12.ll | 4 +--
5 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 58ab99e0dcdeee..1d27af91511629 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23807,6 +23807,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());
+ // If we have a single extract_element with a constant index, track the index
+ // value.
+ unsigned OneConstExtractIndex = ~0u;
+
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
@@ -23824,16 +23828,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
- if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(Op.getOperand(1)))
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
- SDValue ExtractedFromVec = Op.getOperand(0);
+ SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
+ auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!ExtractIdx)
+ return SDValue();
- const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
- if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ if (ExtractIdx->getAsAPIntVal().uge(
+ ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
@@ -23841,6 +23847,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
+
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
@@ -23863,6 +23871,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
+ // If we only found a single constant indexed extract_vector_elt feeding the
+ // build_vector, do not produce a more complicated shuffle if the extract is
+ // cheap.
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ return SDValue();
+
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2bd..27cb3eb406e9e8 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -14,11 +14,9 @@ define <16 x i32> @test2(<16 x i32> %x) {
define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%b = extractelement <4 x float> %a, i32 2
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
index 435ea61412b73e..3da53a4ca5f1b9 100644
--- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
@@ -31,18 +31,16 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
; AVX-32: # %bb.0: # %L.entry
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
-; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
; AVX-32-NEXT: vzeroupper
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: PR15298:
; AVX-64: # %bb.0: # %L.entry
-; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
-; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
; AVX-64-NEXT: vzeroupper
; AVX-64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll
index 7b4bd3ffdf00c5..0d5bdb0954ce31 100644
--- a/llvm/test/CodeGen/X86/sse-align-12.ll
+++ b/llvm/test/CodeGen/X86/sse-align-12.ll
@@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind {
define <2 x double> @c(ptr %y) nounwind {
; CHECK-LABEL: c:
; CHECK: # %bb.0:
-; CHECK-NEXT: movups (%rdi), %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; CHECK-NEXT: retq
%x = load <2 x double>, ptr %y, align 8
%a = extractelement <2 x double> %x, i32 0
From a4823b0ea585d7a96e25870b4589b4086da4f6d8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 15 Jan 2025 15:43:54 +0700
Subject: [PATCH 2/5] Check hasOneUse on the source vector. This gives up some
of the x86 improvements
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++++-
llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 +++++-----
llvm/test/CodeGen/X86/avx512-build-vector.ll | 8 +++++---
llvm/test/CodeGen/X86/sse-align-12.ll | 4 ++--
4 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1d27af91511629..70d2947821c061 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23874,8 +23874,12 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// If we only found a single constant indexed extract_vector_elt feeding the
// build_vector, do not produce a more complicated shuffle if the extract is
// cheap.
+
+ // TODO: This should be more aggressive about skipping the shuffle formation
+ // (e.g., always do this for VecIn[1]->hasOneUse())
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
- TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ (VecIn[1].hasOneUse() &&
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)))
return SDValue();
unsigned MaxIndex = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index add8c0f75bf335..7912d1cf8dc0d1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index 27cb3eb406e9e8..b21a0c4e36c2bd 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -14,9 +14,11 @@ define <16 x i32> @test2(<16 x i32> %x) {
define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%b = extractelement <4 x float> %a, i32 2
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll
index 0d5bdb0954ce31..7b4bd3ffdf00c5 100644
--- a/llvm/test/CodeGen/X86/sse-align-12.ll
+++ b/llvm/test/CodeGen/X86/sse-align-12.ll
@@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind {
define <2 x double> @c(ptr %y) nounwind {
; CHECK-LABEL: c:
; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT: movups (%rdi), %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
%x = load <2 x double>, ptr %y, align 8
%a = extractelement <2 x double> %x, i32 0
From e845f03322548ff35a61f4e9033e0de7de22860f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 16 Jan 2025 13:47:27 +0700
Subject: [PATCH 3/5] Count the number of extract uses
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 +-
.../any_extend_vector_inreg_of_broadcast.ll | 240 +++++-----
...d_vector_inreg_of_broadcast_from_memory.ll | 118 +++--
llvm/test/CodeGen/X86/buildvec-extract.ll | 39 +-
.../CodeGen/X86/insertelement-duplicates.ll | 10 +-
llvm/test/CodeGen/X86/movmsk-bittest.ll | 27 +-
.../CodeGen/X86/split-extend-vector-inreg.ll | 46 +-
llvm/test/CodeGen/X86/sse41.ll | 92 ++--
llvm/test/CodeGen/X86/vec_extract-avx.ll | 4 +-
.../vector-interleaved-store-i32-stride-5.ll | 26 +-
llvm/test/CodeGen/X86/vector-narrow-binop.ll | 9 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 420 +++++++++---------
...d_vector_inreg_of_broadcast_from_memory.ll | 325 ++++++--------
14 files changed, 681 insertions(+), 699 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 70d2947821c061..d5d8336cc2c47c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23811,6 +23811,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// value.
unsigned OneConstExtractIndex = ~0u;
+ unsigned NumExtracts = 0;
+
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
@@ -23847,7 +23849,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
- OneConstExtractIndex = ExtractIdx->getZExtValue();
+ if (OneConstExtractIndex == ~0u)
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
+
+ ++NumExtracts;
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
@@ -23878,8 +23883,11 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// TODO: This should be more aggressive about skipping the shuffle formation
// (e.g., always do this for VecIn[1]->hasOneUse())
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
- (VecIn[1].hasOneUse() &&
- TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)))
+ TLI.isTypeLegal(VT.getVectorElementType()) &&
+ // VecIn[1].hasOneUse() &&
+ NumExtracts == 1
+ //&& TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ )
return SDValue();
unsigned MaxIndex = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index cad1d09f11d9c3..940c1ca2d4d355 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -3817,21 +3817,21 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: paddb (%rdx), %xmm2
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: paddb 32(%rdx), %xmm1
-; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: paddb (%rdx), %xmm4
+; SSE2-NEXT: paddb 16(%rdx), %xmm3
+; SSE2-NEXT: paddb 32(%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -3840,16 +3840,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb 32(%rdx), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm3
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -3858,15 +3858,15 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
@@ -3875,7 +3875,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
@@ -3985,20 +3985,16 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -4079,29 +4075,29 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: paddb (%rdx), %xmm2
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: paddb (%rdx), %xmm3
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
@@ -4339,33 +4335,33 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb 32(%rdx), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm3
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
@@ -4375,7 +4371,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -4492,20 +4488,16 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4599,28 +4591,28 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -4750,22 +4742,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -4860,14 +4848,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: paddb (%rdx), %xmm0
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
@@ -4876,12 +4864,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 3d72319f59ca9e..9572ff3a37fadd 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3141,13 +3141,14 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3233,17 +3234,13 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3519,16 +3516,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: paddb 16(%rsi), %xmm0
-; SSE42-NEXT: paddb 32(%rsi), %xmm2
-; SSE42-NEXT: movdqa %xmm2, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
-; SSE42-NEXT: movdqa %xmm1, (%rdx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7]
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 16(%rsi), %xmm2
+; SSE42-NEXT: paddb 32(%rsi), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -3537,8 +3534,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vbroadcastss (%rdi), %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3549,10 +3546,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3634,19 +3631,15 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3708,26 +3701,25 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movaps 48(%rdi), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: paddb 16(%rsi), %xmm0
+; SSE2-NEXT: paddb 16(%rsi), %xmm2
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: movdqa %xmm1, (%rdx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: paddb 16(%rsi), %xmm0
-; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: movdqa %xmm1, (%rdx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE42-NEXT: paddb 16(%rsi), %xmm1
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: movdqa %xmm0, (%rdx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -3820,19 +3812,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 545c57fed4b2c3..a10e38512b26c2 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero:
@@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
@@ -375,7 +375,8 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
@@ -416,14 +417,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract1_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
@@ -452,14 +453,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract2_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
@@ -486,14 +487,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract3_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64
diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
index 3da53a4ca5f1b9..435ea61412b73e 100644
--- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
@@ -31,16 +31,18 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
; AVX-32: # %bb.0: # %L.entry
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
; AVX-32-NEXT: vzeroupper
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: PR15298:
; AVX-64: # %bb.0: # %L.entry
-; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
+; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
; AVX-64-NEXT: vzeroupper
; AVX-64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/movmsk-bittest.ll b/llvm/test/CodeGen/X86/movmsk-bittest.ll
index b67e70e71c3d57..0bde62f106ae60 100644
--- a/llvm/test/CodeGen/X86/movmsk-bittest.ll
+++ b/llvm/test/CodeGen/X86/movmsk-bittest.ll
@@ -219,14 +219,23 @@ define i32 @movmsk_sgt_v16i8_15(<16 x i8> %v, i32 %a, i32 %b) {
}
define i32 @movmsk_eq_v4i64_0(<4 x i64> %v, i32 %a, i32 %b) {
-; SSE-LABEL: movmsk_eq_v4i64_0:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %edi, %eax
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE-NEXT: movmskps %xmm0, %ecx
-; SSE-NEXT: testb $1, %cl
-; SSE-NEXT: cmovel %esi, %eax
-; SSE-NEXT: retq
+; SSE2-LABEL: movmsk_eq_v4i64_0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movl %edi, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: movmskps %xmm0, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: cmovel %esi, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: movmsk_eq_v4i64_0:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movl %edi, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE41-NEXT: movmskps %xmm0, %ecx
+; SSE41-NEXT: testb $1, %cl
+; SSE41-NEXT: cmovel %esi, %eax
+; SSE41-NEXT: retq
;
; AVX-LABEL: movmsk_eq_v4i64_0:
; AVX: # %bb.0:
@@ -557,5 +566,3 @@ define i32 @movmsk_sgt_v32i8_31(<32 x i8> %v, i32 %a, i32 %b) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1OR2: {{.*}}
-; SSE2: {{.*}}
-; SSE41: {{.*}}
diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
index 8a6c2f851a6d69..f76e7f64fbf096 100644
--- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -1,21 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X32 %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X64 %s
define <4 x i64> @autogen_SD88863() {
-; CHECK-LABEL: autogen_SD88863:
-; CHECK: # %bb.0: # %BB
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB0_1: # %CF
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # %bb.2: # %CF240
-; CHECK-NEXT: ret{{[l|q]}}
+; X32-LABEL: autogen_SD88863:
+; X32: # %bb.0: # %BB
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
+; X32-NEXT: movb $1, %al
+; X32-NEXT: .p2align 4
+; X32-NEXT: .LBB0_1: # %CF
+; X32-NEXT: # =>This Inner Loop Header: Depth=1
+; X32-NEXT: testb %al, %al
+; X32-NEXT: jne .LBB0_1
+; X32-NEXT: # %bb.2: # %CF240
+; X32-NEXT: retl
+;
+; X64-LABEL: autogen_SD88863:
+; X64: # %bb.0: # %BB
+; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: movb $1, %al
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB0_1: # %CF
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB0_1
+; X64-NEXT: # %bb.2: # %CF240
+; X64-NEXT: retq
BB:
%I26 = insertelement <4 x i64> undef, i64 undef, i32 2
br label %CF
@@ -29,3 +43,5 @@ CF:
CF240:
ret <4 x i64> %I68
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09..6443bd5cda55f6 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1233,31 +1233,47 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; SSE-LABEL: i32_shuf_X00X:
-; SSE: ## %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
-; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
-; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+; X86-SSE-LABEL: i32_shuf_X00X:
+; X86-SSE: ## %bb.0:
+; X86-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX1-LABEL: i32_shuf_X00X:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
-; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
-; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX1-LABEL: i32_shuf_X00X:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
+; X86-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
-; AVX512-LABEL: i32_shuf_X00X:
-; AVX512: ## %bb.0:
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
-; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
-; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512-LABEL: i32_shuf_X00X:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
+; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512-NEXT: retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: i32_shuf_X00X:
+; X64-SSE: ## %bb.0:
+; X64-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: i32_shuf_X00X:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
+; X64-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX1-NEXT: retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: i32_shuf_X00X:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
+; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -1269,32 +1285,26 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_X0YC:
; SSE: ## %bb.0:
-; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
-; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
-; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2]
-; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
-; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
+; SSE-NEXT: pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0]
+; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0]
+; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_X0YC:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
-; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2]
-; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
-; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
+; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_X0YC:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
-; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2]
-; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
-; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
+; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -2124,14 +2134,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
@@ -2152,14 +2162,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index 341a703a21bd5e..ff0a68eb5692cb 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -126,9 +126,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
;
; X64-LABEL: legal_vzmovl_2i32_8i32:
; X64: # %bb.0:
-; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 07d8a370a5f930..1dd015cc516f03 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -18,24 +18,24 @@
define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride5_vf2:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movq %xmm2, 32(%r9)
-; SSE-NEXT: movaps %xmm6, (%r9)
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; SSE-NEXT: movq %xmm5, 32(%r9)
+; SSE-NEXT: movdqa %xmm3, (%r9)
; SSE-NEXT: movaps %xmm0, 16(%r9)
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index ad345213c1472c..6f1948d3bc2a53 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -107,9 +107,11 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
-; SSE-NEXT: psrld $16, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: psrlq $16, %xmm2
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -117,9 +119,8 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ce092f9d343fc6..2fc1e662d7a7f9 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4115,24 +4115,24 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: paddb (%rdx), %xmm2
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: paddb 32(%rdx), %xmm1
-; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm2, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pinsrw $2, %eax, %xmm3
+; SSE2-NEXT: pinsrw $4, %eax, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: paddb (%rdx), %xmm4
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: paddb 32(%rdx), %xmm3
+; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4141,19 +4141,19 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE42-NEXT: movd %xmm0, %eax
+; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pxor %xmm3, %xmm3
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb 32(%rdx), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: pinsrw $2, %eax, %xmm3
+; SSE42-NEXT: pinsrw $4, %eax, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: paddb 32(%rdx), %xmm3
+; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4165,16 +4165,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4183,12 +4183,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -4323,36 +4324,37 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: paddb (%rdx), %xmm2
-; SSE2-NEXT: movdqa 16(%rdx), %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: paddb 32(%rdx), %xmm0
-; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: paddb (%rdx), %xmm3
+; SSE2-NEXT: movdqa 16(%rdx), %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: paddb 32(%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm3, (%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: movdqa 16(%rdx), %xmm0
-; SSE42-NEXT: paddb %xmm2, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: movdqa 16(%rdx), %xmm1
+; SSE42-NEXT: paddb %xmm2, %xmm1
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -4362,9 +4364,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -4461,36 +4463,36 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: movaps 32(%rdx), %xmm1
-; SSE2-NEXT: paddb (%rdx), %xmm2
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: movaps %xmm1, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm2, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pinsrw $4, %eax, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: movaps 32(%rdx), %xmm0
+; SSE2-NEXT: paddb (%rdx), %xmm3
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: movaps %xmm0, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: movd %xmm0, %eax
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE42-NEXT: movaps 32(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: pinsrw $4, %eax, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: movaps 32(%rdx), %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movaps %xmm0, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movaps %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -4800,14 +4802,14 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm1
-; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT: paddb 32(%rdx), %xmm3
+; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
@@ -4818,18 +4820,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: paddb 16(%rdx), %xmm1
-; SSE42-NEXT: paddb 32(%rdx), %xmm0
-; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: paddb 32(%rdx), %xmm3
+; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -4841,10 +4843,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
@@ -4857,20 +4859,19 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
-; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm2
+; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7]
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -4878,18 +4879,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -4897,18 +4898,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -4997,11 +4998,11 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa 16(%rdx), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
@@ -5015,18 +5016,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: movdqa 16(%rdx), %xmm0
-; SSE42-NEXT: paddb %xmm2, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: movdqa 16(%rdx), %xmm1
+; SSE42-NEXT: paddb %xmm2, %xmm1
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -5036,9 +5037,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -5051,18 +5052,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5150,36 +5151,36 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
-; SSE2-NEXT: movaps 32(%rdx), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps 32(%rdx), %xmm0
; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: movaps %xmm2, 32(%rcx)
+; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: movaps %xmm0, 32(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
-; SSE42-NEXT: movaps 32(%rdx), %xmm2
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: movaps %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: movaps 32(%rdx), %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: movaps %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -5283,17 +5284,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: movdqa 16(%rdx), %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm0
-; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: paddb (%rdx), %xmm0
+; SSE2-NEXT: movdqa 16(%rdx), %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: paddb 32(%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
@@ -5302,15 +5303,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: movdqa 16(%rdx), %xmm2
-; SSE42-NEXT: paddb %xmm0, %xmm2
-; SSE42-NEXT: paddb 32(%rdx), %xmm0
-; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: movdqa 16(%rdx), %xmm1
+; SSE42-NEXT: paddb %xmm2, %xmm1
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
@@ -5320,8 +5321,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
@@ -5335,15 +5336,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5429,8 +5430,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: movaps 32(%rdx), %xmm2
@@ -5447,14 +5448,15 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE42-NEXT: movaps 32(%rdx), %xmm2
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: movaps %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: movaps 32(%rdx), %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: paddb 16(%rdx), %xmm2
+; SSE42-NEXT: movaps %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index acedcf42639066..d874ceb3f47364 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3316,41 +3316,39 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa 48(%rdi), %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: paddb (%rsi), %xmm1
-; SSE2-NEXT: paddb 16(%rsi), %xmm0
+; SSE2-NEXT: movl (%rdi), %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa 48(%rdi), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pinsrw $2, %eax, %xmm2
+; SSE2-NEXT: pinsrw $4, %eax, %xmm1
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: paddb 32(%rsi), %xmm2
; SSE2-NEXT: movdqa %xmm2, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm1, (%rdx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE42-NEXT: movl (%rdi), %eax
+; SSE42-NEXT: pxor %xmm0, %xmm0
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pinsrw $2, %eax, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7]
+; SSE42-NEXT: pinsrw $4, %eax, %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm0
-; SSE42-NEXT: paddb (%rsi), %xmm3
+; SSE42-NEXT: paddb (%rsi), %xmm2
; SSE42-NEXT: paddb 32(%rsi), %xmm1
; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm3, (%rdx)
+; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
@@ -3359,66 +3357,35 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpinsrw $2, (%rdi), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
+; AVX2-NEXT: vpbroadcastw (%rdi), %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
@@ -3481,14 +3448,15 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: paddb 32(%rsi), %xmm0
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, (%rdx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3527,7 +3495,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
@@ -3596,9 +3564,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pinsrw $4, %eax, %xmm0
; SSE2-NEXT: movaps 32(%rsi), %xmm2
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: paddb 16(%rsi), %xmm0
@@ -3612,15 +3580,15 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE42-NEXT: movaps 32(%rsi), %xmm0
-; SSE42-NEXT: paddb 16(%rsi), %xmm2
+; SSE42-NEXT: movd %xmm0, %eax
+; SSE42-NEXT: pxor %xmm0, %xmm0
+; SSE42-NEXT: pinsrw $4, %eax, %xmm0
+; SSE42-NEXT: movaps 32(%rsi), %xmm2
+; SSE42-NEXT: paddb 16(%rsi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: movaps %xmm0, 32(%rdx)
+; SSE42-NEXT: movaps %xmm2, 32(%rdx)
; SSE42-NEXT: movdqa %xmm1, (%rdx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
@@ -3862,9 +3830,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 32(%rsi), %xmm2
@@ -3879,9 +3847,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm2
@@ -3895,12 +3863,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vbroadcastss (%rdi), %ymm2
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX-NEXT: vbroadcastss (%rdi), %ymm3
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
@@ -3911,27 +3879,26 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7]
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,5,6,0]
+; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6],ymm1[7]
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -3945,10 +3912,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -4029,47 +3995,47 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: movdqa 48(%rdi), %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: movdqa 16(%rsi), %xmm0
-; SSE42-NEXT: paddb %xmm2, %xmm0
-; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: paddb 32(%rsi), %xmm2
-; SSE42-NEXT: movdqa %xmm2, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm1, (%rdx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE42-NEXT: movdqa 16(%rsi), %xmm2
+; SSE42-NEXT: paddb %xmm1, %xmm2
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 32(%rsi), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm0, (%rdx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
-; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -4142,17 +4108,16 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: movdqa 48(%rdi), %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
; SSE42-NEXT: movaps 32(%rsi), %xmm2
-; SSE42-NEXT: paddb 16(%rsi), %xmm0
-; SSE42-NEXT: paddb (%rsi), %xmm1
+; SSE42-NEXT: paddb 16(%rsi), %xmm1
+; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: movaps %xmm2, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm1, (%rdx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm0, (%rdx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -4233,61 +4198,57 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movapd (%rdi), %xmm0
-; SSE2-NEXT: movapd 48(%rdi), %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm2
-; SSE2-NEXT: paddb (%rsi), %xmm1
-; SSE2-NEXT: paddb 32(%rsi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm1, (%rdx)
+; SSE2-NEXT: paddb %xmm1, %xmm2
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: paddb 32(%rsi), %xmm1
+; SSE2-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE42-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
-; SSE42-NEXT: paddb %xmm0, %xmm2
-; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: paddb 32(%rsi), %xmm0
-; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm1, (%rdx)
+; SSE42-NEXT: paddb %xmm1, %xmm2
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 32(%rsi), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
>From 7cd9367b4f9a23c7603fed68d3c5f4f085561eaa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 16 Jan 2025 14:29:15 +0700
Subject: [PATCH 4/5] Avoid ARM regressions
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +-
.../any_extend_vector_inreg_of_broadcast.ll | 240 +++++-----
...d_vector_inreg_of_broadcast_from_memory.ll | 118 ++---
llvm/test/CodeGen/X86/buildvec-extract.ll | 39 +-
llvm/test/CodeGen/X86/movmsk-bittest.ll | 27 +-
.../CodeGen/X86/split-extend-vector-inreg.ll | 46 +-
llvm/test/CodeGen/X86/sse41.ll | 92 ++--
llvm/test/CodeGen/X86/vec_extract-avx.ll | 4 +-
.../vector-interleaved-store-i32-stride-5.ll | 26 +-
llvm/test/CodeGen/X86/vector-narrow-binop.ll | 9 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 420 +++++++++---------
...d_vector_inreg_of_broadcast_from_memory.ll | 325 ++++++++------
12 files changed, 688 insertions(+), 662 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d5d8336cc2c47c..8cfd994a21d56c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23885,9 +23885,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
TLI.isTypeLegal(VT.getVectorElementType()) &&
// VecIn[1].hasOneUse() &&
- NumExtracts == 1
- //&& TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
- )
+ NumExtracts == 1 && TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
return SDValue();
unsigned MaxIndex = 0;
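
For context (this is not part of the patch): the bail-out above only fires when the target reports the constant-index extract as cheap through the existing TargetLowering::isExtractVecEltCheap hook, whose default returns false. A minimal sketch of how a target could opt in is below; "MyTargetLowering" is a made-up class name used only for illustration, and the (EVT, unsigned) signature is assumed from the call site in the hunk.

// Sketch only: report lane-0 extracts of 32-bit elements as free, so the
// combiner keeps build_vector (extract_vector_elt x, 0), undef instead of
// widening it into a shufflevector. All other extracts keep the default
// (not cheap) answer and still go through the shuffle path.
bool MyTargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
  return Index == 0 && VT.getScalarSizeInBits() == 32;
}
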
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 940c1ca2d4d355..cad1d09f11d9c3 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -3817,21 +3817,21 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: paddb (%rdx), %xmm4
-; SSE2-NEXT: paddb 16(%rdx), %xmm3
-; SSE2-NEXT: paddb 32(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb 32(%rdx), %xmm1
+; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -3840,16 +3840,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm3
-; SSE42-NEXT: paddb 32(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: paddb 32(%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -3858,15 +3858,15 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
@@ -3875,7 +3875,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
@@ -3985,16 +3985,20 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
+; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -4075,29 +4079,29 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: paddb (%rdx), %xmm3
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm3, (%rcx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
@@ -4335,33 +4339,33 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm3
-; SSE42-NEXT: paddb 32(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: paddb 32(%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
@@ -4371,7 +4375,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -4488,16 +4492,20 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
+; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4591,28 +4599,28 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -4742,18 +4750,22 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -4848,14 +4860,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; SSE2-NEXT: paddb (%rdx), %xmm0
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm0, (%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: paddb (%rdx), %xmm1
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: movdqa %xmm1, (%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
@@ -4864,12 +4876,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 9572ff3a37fadd..3d72319f59ca9e 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3141,14 +3141,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3234,13 +3233,17 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa (%rdi), %xmm3
+; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3516,16 +3519,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7]
-; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: paddb 16(%rsi), %xmm2
-; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
-; SSE42-NEXT: movdqa %xmm0, (%rdx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb (%rsi), %xmm1
+; SSE42-NEXT: paddb 16(%rsi), %xmm0
+; SSE42-NEXT: paddb 32(%rsi), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -3534,8 +3537,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vbroadcastss (%rdi), %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3546,10 +3549,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3631,15 +3634,19 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa (%rdi), %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa (%rdi), %xmm2
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3701,25 +3708,26 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 48(%rdi), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: paddb 16(%rsi), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: paddb 16(%rsi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: movdqa %xmm1, (%rdx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
-; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: movdqa %xmm0, (%rdx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
+; SSE42-NEXT: movdqa 48(%rdi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: paddb 16(%rsi), %xmm0
+; SSE42-NEXT: paddb (%rsi), %xmm1
+; SSE42-NEXT: movdqa %xmm1, (%rdx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -3812,15 +3820,19 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa (%rdi), %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa (%rdi), %xmm2
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index a10e38512b26c2..545c57fed4b2c3 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero:
@@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
@@ -375,8 +375,7 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
@@ -417,14 +416,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract1_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
@@ -453,14 +452,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract2_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
@@ -487,14 +486,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract3_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64
diff --git a/llvm/test/CodeGen/X86/movmsk-bittest.ll b/llvm/test/CodeGen/X86/movmsk-bittest.ll
index 0bde62f106ae60..b67e70e71c3d57 100644
--- a/llvm/test/CodeGen/X86/movmsk-bittest.ll
+++ b/llvm/test/CodeGen/X86/movmsk-bittest.ll
@@ -219,23 +219,14 @@ define i32 @movmsk_sgt_v16i8_15(<16 x i8> %v, i32 %a, i32 %b) {
}
define i32 @movmsk_eq_v4i64_0(<4 x i64> %v, i32 %a, i32 %b) {
-; SSE2-LABEL: movmsk_eq_v4i64_0:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movmskps %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: cmovel %esi, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: movmsk_eq_v4i64_0:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE41-NEXT: movmskps %xmm0, %ecx
-; SSE41-NEXT: testb $1, %cl
-; SSE41-NEXT: cmovel %esi, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: movmsk_eq_v4i64_0:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: movmskps %xmm0, %ecx
+; SSE-NEXT: testb $1, %cl
+; SSE-NEXT: cmovel %esi, %eax
+; SSE-NEXT: retq
;
; AVX-LABEL: movmsk_eq_v4i64_0:
; AVX: # %bb.0:
@@ -566,3 +557,5 @@ define i32 @movmsk_sgt_v32i8_31(<32 x i8> %v, i32 %a, i32 %b) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1OR2: {{.*}}
+; SSE2: {{.*}}
+; SSE41: {{.*}}
diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
index f76e7f64fbf096..8a6c2f851a6d69 100644
--- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -1,35 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X32 %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X64 %s
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s
define <4 x i64> @autogen_SD88863() {
-; X32-LABEL: autogen_SD88863:
-; X32: # %bb.0: # %BB
-; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
-; X32-NEXT: movb $1, %al
-; X32-NEXT: .p2align 4
-; X32-NEXT: .LBB0_1: # %CF
-; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: testb %al, %al
-; X32-NEXT: jne .LBB0_1
-; X32-NEXT: # %bb.2: # %CF240
-; X32-NEXT: retl
-;
-; X64-LABEL: autogen_SD88863:
-; X64: # %bb.0: # %BB
-; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-NEXT: movb $1, %al
-; X64-NEXT: .p2align 4
-; X64-NEXT: .LBB0_1: # %CF
-; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: testb %al, %al
-; X64-NEXT: jne .LBB0_1
-; X64-NEXT: # %bb.2: # %CF240
-; X64-NEXT: retq
+; CHECK-LABEL: autogen_SD88863:
+; CHECK: # %bb.0: # %BB
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %CF
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %CF240
+; CHECK-NEXT: ret{{[l|q]}}
BB:
%I26 = insertelement <4 x i64> undef, i64 undef, i32 2
br label %CF
@@ -43,5 +29,3 @@ CF:
CF240:
ret <4 x i64> %I68
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 6443bd5cda55f6..2d7258a49f5d09 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1233,47 +1233,31 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; X86-SSE-LABEL: i32_shuf_X00X:
-; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-SSE-NEXT: retl ## encoding: [0xc3]
-;
-; X86-AVX1-LABEL: i32_shuf_X00X:
-; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; X86-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX1-NEXT: retl ## encoding: [0xc3]
-;
-; X86-AVX512-LABEL: i32_shuf_X00X:
-; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512-NEXT: retl ## encoding: [0xc3]
-;
-; X64-SSE-LABEL: i32_shuf_X00X:
-; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; X64-SSE-NEXT: retq ## encoding: [0xc3]
+; SSE-LABEL: i32_shuf_X00X:
+; SSE: ## %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
+; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
+; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
+; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
-; X64-AVX1-LABEL: i32_shuf_X00X:
-; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; X64-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; X64-AVX1-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: i32_shuf_X00X:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
+; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
-; X64-AVX512-LABEL: i32_shuf_X00X:
-; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; X64-AVX512-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: i32_shuf_X00X:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
+; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -1285,26 +1269,32 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_X0YC:
; SSE: ## %bb.0:
-; SSE-NEXT: pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0]
-; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0]
-; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
+; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
+; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2]
+; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
+; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_X0YC:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
-; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2]
+; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
+; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_X0YC:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
-; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2]
+; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
+; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -2134,14 +2124,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
+; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
+; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
@@ -2162,14 +2152,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
+; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
+; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index ff0a68eb5692cb..341a703a21bd5e 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -126,7 +126,9 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
;
; X64-LABEL: legal_vzmovl_2i32_8i32:
; X64: # %bb.0:
-; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 1dd015cc516f03..07d8a370a5f930 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -18,24 +18,24 @@
define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride5_vf2:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: movq %xmm5, 32(%r9)
-; SSE-NEXT: movdqa %xmm3, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, 32(%r9)
+; SSE-NEXT: movaps %xmm6, (%r9)
; SSE-NEXT: movaps %xmm0, 16(%r9)
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index 6f1948d3bc2a53..ad345213c1472c 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -107,11 +107,9 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
+; SSE-NEXT: psrld $16, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: psrlq $16, %xmm2
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -119,8 +117,9 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 2fc1e662d7a7f9..ce092f9d343fc6 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4115,24 +4115,24 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pinsrw $2, %eax, %xmm3
-; SSE2-NEXT: pinsrw $4, %eax, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: paddb (%rdx), %xmm4
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm3
-; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb 32(%rdx), %xmm1
+; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4141,19 +4141,19 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: movd %xmm0, %eax
-; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; SSE42-NEXT: pxor %xmm3, %xmm3
-; SSE42-NEXT: pinsrw $2, %eax, %xmm3
-; SSE42-NEXT: pinsrw $4, %eax, %xmm2
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: paddb 32(%rdx), %xmm3
-; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: paddb 32(%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4165,16 +4165,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4183,13 +4183,12 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -4324,37 +4323,36 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: paddb (%rdx), %xmm3
-; SSE2-NEXT: movdqa 16(%rdx), %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: paddb 32(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm3, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: movdqa 16(%rdx), %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: paddb 32(%rdx), %xmm0
+; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: movdqa 16(%rdx), %xmm1
-; SSE42-NEXT: paddb %xmm2, %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: movdqa 16(%rdx), %xmm0
+; SSE42-NEXT: paddb %xmm2, %xmm0
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -4364,9 +4362,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -4463,36 +4461,36 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pinsrw $4, %eax, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movaps 32(%rdx), %xmm0
-; SSE2-NEXT: paddb (%rdx), %xmm3
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: movaps %xmm0, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm3, (%rcx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movaps 32(%rdx), %xmm1
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: movaps %xmm1, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: movd %xmm0, %eax
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pinsrw $4, %eax, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: movaps 32(%rdx), %xmm1
-; SSE42-NEXT: paddb (%rdx), %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE42-NEXT: movaps 32(%rdx), %xmm0
+; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movaps %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: movaps %xmm0, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -4802,14 +4800,14 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm3
-; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT: paddb 32(%rdx), %xmm1
+; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
@@ -4820,18 +4818,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: paddb 32(%rdx), %xmm3
-; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: paddb 16(%rdx), %xmm1
+; SSE42-NEXT: paddb 32(%rdx), %xmm0
+; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -4843,10 +4841,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
@@ -4859,19 +4857,20 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
+; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -4879,18 +4878,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -4898,18 +4897,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -4998,11 +4997,11 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa 16(%rdx), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
@@ -5016,18 +5015,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: movdqa 16(%rdx), %xmm1
-; SSE42-NEXT: paddb %xmm2, %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: movdqa 16(%rdx), %xmm0
+; SSE42-NEXT: paddb %xmm2, %xmm0
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -5037,9 +5036,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -5052,18 +5051,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5151,36 +5150,36 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: movaps 32(%rdx), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
+; SSE2-NEXT: movaps 32(%rdx), %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: paddb 16(%rdx), %xmm2
-; SSE2-NEXT: movaps %xmm0, 32(%rcx)
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: movaps %xmm2, 32(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
-; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 48(%rsi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: movaps 32(%rdx), %xmm1
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movaps %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
+; SSE42-NEXT: movaps 32(%rdx), %xmm2
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: movaps %xmm2, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -5284,17 +5283,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; SSE2-NEXT: paddb (%rdx), %xmm0
-; SSE2-NEXT: movdqa 16(%rdx), %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: paddb 32(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm0, (%rcx)
-; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT: paddb (%rdx), %xmm1
+; SSE2-NEXT: movdqa 16(%rdx), %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm2
+; SSE2-NEXT: paddb 32(%rdx), %xmm0
+; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm1, (%rcx)
+; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
@@ -5303,15 +5302,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: movdqa 16(%rdx), %xmm1
-; SSE42-NEXT: paddb %xmm2, %xmm1
-; SSE42-NEXT: paddb 32(%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: movdqa 16(%rdx), %xmm2
+; SSE42-NEXT: paddb %xmm0, %xmm2
+; SSE42-NEXT: paddb 32(%rdx), %xmm0
+; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
@@ -5321,8 +5320,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
@@ -5336,15 +5335,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5430,8 +5429,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: movaps 32(%rdx), %xmm2
@@ -5448,15 +5447,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: movaps 32(%rdx), %xmm1
-; SSE42-NEXT: paddb (%rdx), %xmm0
-; SSE42-NEXT: paddb 16(%rdx), %xmm2
-; SSE42-NEXT: movaps %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
-; SSE42-NEXT: movdqa %xmm0, (%rcx)
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE42-NEXT: movaps 32(%rdx), %xmm2
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: movaps %xmm2, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index d874ceb3f47364..acedcf42639066 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3316,39 +3316,41 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE2: # %bb.0:
-; SSE2-NEXT: movl (%rdi), %eax
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pinsrw $2, %eax, %xmm2
-; SSE2-NEXT: pinsrw $4, %eax, %xmm1
-; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: paddb 16(%rsi), %xmm1
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa 48(%rdi), %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: paddb (%rsi), %xmm1
+; SSE2-NEXT: paddb 16(%rsi), %xmm0
; SSE2-NEXT: paddb 32(%rsi), %xmm2
; SSE2-NEXT: movdqa %xmm2, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
-; SSE2-NEXT: movdqa %xmm0, (%rdx)
+; SSE2-NEXT: movdqa %xmm1, (%rdx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
-; SSE42-NEXT: movl (%rdi), %eax
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pinsrw $2, %eax, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7]
-; SSE42-NEXT: pinsrw $4, %eax, %xmm0
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
; SSE42-NEXT: paddb 16(%rsi), %xmm0
-; SSE42-NEXT: paddb (%rsi), %xmm2
+; SSE42-NEXT: paddb (%rsi), %xmm3
; SSE42-NEXT: paddb 32(%rsi), %xmm1
; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm2, (%rdx)
+; SSE42-NEXT: movdqa %xmm3, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
@@ -3357,35 +3359,66 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpinsrw $2, (%rdi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: retq
;
-; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
+; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-FAST-PERLANE-NEXT: vzeroupper
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
@@ -3448,15 +3481,14 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: paddb 32(%rsi), %xmm0
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm1, (%rdx)
; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE2-NEXT: movdqa %xmm1, (%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3495,7 +3527,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
@@ -3564,9 +3596,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pinsrw $4, %eax, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: movaps 32(%rsi), %xmm2
; SSE2-NEXT: paddb (%rsi), %xmm1
; SSE2-NEXT: paddb 16(%rsi), %xmm0
@@ -3580,15 +3612,15 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: movd %xmm0, %eax
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: pinsrw $4, %eax, %xmm0
-; SSE42-NEXT: movaps 32(%rsi), %xmm2
-; SSE42-NEXT: paddb 16(%rsi), %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE42-NEXT: movaps 32(%rsi), %xmm0
+; SSE42-NEXT: paddb 16(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm1
-; SSE42-NEXT: movaps %xmm2, 32(%rdx)
+; SSE42-NEXT: movaps %xmm0, 32(%rdx)
; SSE42-NEXT: movdqa %xmm1, (%rdx)
-; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
@@ -3830,9 +3862,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 32(%rsi), %xmm2
@@ -3847,9 +3879,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm2
@@ -3863,12 +3895,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vbroadcastss (%rdi), %ymm2
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vbroadcastss (%rdi), %ymm3
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
@@ -3879,26 +3911,27 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,5,6,0]
-; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6],ymm1[7]
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -3912,9 +3945,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -3995,47 +4029,47 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
-; SSE42-NEXT: movdqa 16(%rsi), %xmm2
-; SSE42-NEXT: paddb %xmm1, %xmm2
-; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm0, (%rdx)
-; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT: movdqa 16(%rsi), %xmm0
+; SSE42-NEXT: paddb %xmm2, %xmm0
+; SSE42-NEXT: paddb (%rsi), %xmm1
+; SSE42-NEXT: paddb 32(%rsi), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm1, (%rdx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -4108,16 +4142,17 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
; SSE42-NEXT: movaps 32(%rsi), %xmm2
-; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: paddb 16(%rsi), %xmm0
+; SSE42-NEXT: paddb (%rsi), %xmm1
; SSE42-NEXT: movaps %xmm2, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm0, (%rdx)
-; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
+; SSE42-NEXT: movdqa %xmm1, (%rdx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -4198,57 +4233,61 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
-; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: movapd 48(%rdi), %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: paddb (%rsi), %xmm0
-; SSE2-NEXT: paddb 32(%rsi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE2-NEXT: movdqa %xmm0, (%rdx)
+; SSE2-NEXT: paddb %xmm0, %xmm2
+; SSE2-NEXT: paddb (%rsi), %xmm1
+; SSE2-NEXT: paddb 32(%rsi), %xmm0
+; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
+; SSE2-NEXT: movdqa %xmm1, (%rdx)
; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
+; SSE42-NEXT: movdqa 48(%rdi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
-; SSE42-NEXT: paddb %xmm1, %xmm2
-; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: paddb 32(%rsi), %xmm1
-; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE42-NEXT: movdqa %xmm0, (%rdx)
+; SSE42-NEXT: paddb %xmm0, %xmm2
+; SSE42-NEXT: paddb (%rsi), %xmm1
+; SSE42-NEXT: paddb 32(%rsi), %xmm0
+; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
+; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
>From 0f588c812c2c551cde646963cd926c5047e9a534 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 16 Jan 2025 14:54:00 +0700
Subject: [PATCH 5/5] Cleanup and comment
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++++++++---------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8cfd994a21d56c..de7fb21f5903e3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23811,6 +23811,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// value.
unsigned OneConstExtractIndex = ~0u;
+ // Count the number of extract_vector_elt sources (i.e. operands that are
+ // neither constant nor undef).
unsigned NumExtracts = 0;
for (unsigned i = 0; i != NumElems; ++i) {
@@ -23849,9 +23850,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
- if (OneConstExtractIndex == ~0u)
- OneConstExtractIndex = ExtractIdx->getZExtValue();
-
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
++NumExtracts;
// Have we seen this input vector before?
@@ -23878,14 +23877,16 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
if (VecIn.size() == 2) {
// If we only found a single constant indexed extract_vector_elt feeding the
// build_vector, do not produce a more complicated shuffle if the extract is
- // cheap.
-
- // TODO: This should be more aggressive about skipping the shuffle formation
- // (e.g., always do this for VecIn[1]->hasOneUse())
- if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+ // cheap and the remaining elements are constant or undef. Skip broadcast
+ // patterns with multiple uses in the build_vector.
+
+ // TODO: This should be more aggressive about skipping the shuffle
+ // formation, particularly if VecIn[1].hasOneUse(), and regardless of the
+ // index.
+ if (NumExtracts == 1 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
TLI.isTypeLegal(VT.getVectorElementType()) &&
- // VecIn[1].hasOneUse() &&
- NumExtracts == 1 && TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
return SDValue();
unsigned MaxIndex = 0;
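The test diffs above are mechanical fallout from this combiner change; the shape being protected is easiest to see at the IR level. The snippet below is only an illustrative sketch (the function name and element type are invented here, not taken from the patch or its tests): a vector whose sole defined lane comes from one constant-indexed extractelement.

define <4 x float> @single_extract(<4 x float> %v) {
  ; Typically reaches the DAG as:
  ;   build_vector (extract_vector_elt %v, 2), undef, undef, undef
  ; Previously reduceBuildVecToShuffle could bloat this into a one-input
  ; vector_shuffle; with this patch, a target whose isExtractVecEltCheap hook
  ; reports the lane extract as cheap keeps the extract + build_vector form.
  %e = extractelement <4 x float> %v, i32 2
  %r = insertelement <4 x float> poison, float %e, i32 0
  ret <4 x float> %r
}

Whether the build_vector survives long enough to hit this combine depends on the target and on earlier combines, so treat this as a shape example rather than a reduced test case.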