[llvm] 21d02dc - [X86][SSE] SimplifyDemandedVectorEltsForTargetNode - add general shuffle combining support
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 2 01:25:11 PDT 2020
Author: Simon Pilgrim
Date: 2020-09-02T09:24:46+01:00
New Revision: 21d02dc595797677a533c6b0508c0c9235bc4f13
URL: https://github.com/llvm/llvm-project/commit/21d02dc595797677a533c6b0508c0c9235bc4f13
DIFF: https://github.com/llvm/llvm-project/commit/21d02dc595797677a533c6b0508c0c9235bc4f13.diff
LOG: [X86][SSE] SimplifyDemandedVectorEltsForTargetNode - add general shuffle combining support
This patch uses partial DemandedElts masks to further simplify target shuffle chains and finally starts making target shuffle combining part of SimplifyDemandedBits/SimplifyDemandedVectorElts.
We already manage this for the Depth == 0 case, where combineX86ShuffleChain early-outs if the shuffle combines back to the same op; the patch generalizes this by reworking the depth handling of combineX86ShufflesRecursively - calling it with a fresh Depth = 0 and reducing the maximum shuffle combine depth accordingly.
Differential Revision: https://reviews.llvm.org/D66004
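In short, the new SimplifyDemandedVectorEltsForTargetNode path converts the demanded elements into an identity shuffle mask and then restarts the recursive combine with a reduced depth budget. A condensed sketch of that hook, lifted from the X86ISelLowering.cpp hunk below (the names are real, surrounding context is omitted):

  // Keep only the demanded lanes; undemanded lanes stay SM_SentinelUndef
  // so later combines are free to discard them.
  SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i)
    if (DemandedElts[i])
      DemandedMask[i] = i;
  // Restart the recursion at Depth = 0, but shrink the budget by the depth
  // already consumed so the walk still terminates, and so the Depth == 0
  // early-outs in combineX86ShuffleChain stop it from returning a shuffle
  // identical to the root (which would loop forever).
  SDValue NewShuffle = combineX86ShufflesRecursively(
      {Op}, 0, Op, DemandedMask, {}, /*Depth*/ 0,
      X86::MaxShuffleCombineDepth - Depth,
      /*HasVarMask*/ false, /*AllowVarMask*/ true, TLO.DAG, Subtarget);
  if (NewShuffle)
    return TLO.CombineTo(Op, NewShuffle);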
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avg.ll
llvm/test/CodeGen/X86/avx-trunc.ll
llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
llvm/test/CodeGen/X86/bitcast-setcc-128.ll
llvm/test/CodeGen/X86/buildvec-extract.ll
llvm/test/CodeGen/X86/buildvec-insertvec.ll
llvm/test/CodeGen/X86/combine-fcopysign.ll
llvm/test/CodeGen/X86/combine-shl.ll
llvm/test/CodeGen/X86/haddsub-undef.ll
llvm/test/CodeGen/X86/insert-into-constant-vector.ll
llvm/test/CodeGen/X86/insertelement-shuffle.ll
llvm/test/CodeGen/X86/known-signbits-vector.ll
llvm/test/CodeGen/X86/load-partial.ll
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/oddsubvector.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/pr29112.ll
llvm/test/CodeGen/X86/promote-cmp.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/shuffle-of-insert.ll
llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/test-shrink-bug.ll
llvm/test/CodeGen/X86/trunc-subvector.ll
llvm/test/CodeGen/X86/udiv_fix.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
llvm/test/CodeGen/X86/vec_insert-2.ll
llvm/test/CodeGen/X86/vec_insert-3.ll
llvm/test/CodeGen/X86/vec_insert-5.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vec_set-6.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/test/CodeGen/X86/vector-pack-256.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-mul.ll
llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/vector-zext.ll
llvm/test/CodeGen/X86/vselect.ll
llvm/test/CodeGen/X86/vshift-4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7440f3238448..dacc6e232830 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34171,11 +34171,18 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
- if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
+ if (MaskEltSize == 32 && Mask[0] == 0) {
+ if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+ if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
}
// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
@@ -35032,16 +35039,30 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// from a scalar.
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
- MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
- !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
- SDValue SrcV1 = V1, SrcV2 = V2;
- if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
- SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ Subtarget.hasSSE41() && !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+ if (MaskEltSizeInBits == 32) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
+ DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, SrcV1),
+ DAG.getBitcast(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ if (MaskEltSizeInBits == 64 && isTargetShuffleEquivalent(Mask, {0, 2}) &&
+ V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V2.getScalarValueSizeInBits() <= 32) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
+ PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, SrcV1),
- DAG.getBitcast(MVT::v4f32, SrcV2),
+ DAG.getBitcast(MVT::v4f32, V1),
+ DAG.getBitcast(MVT::v4f32, V2),
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -35704,6 +35725,14 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return DAG.getBitcast(VT, CstOp);
}
+namespace llvm {
+ namespace X86 {
+ enum {
+ MaxShuffleCombineDepth = 8
+ };
+ };
+}; // namespace llvm
+
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
@@ -35736,8 +35765,8 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
- bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
@@ -35747,8 +35776,7 @@ static SDValue combineX86ShufflesRecursively(
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
- const unsigned MaxRecursionDepth = 8;
- if (Depth >= MaxRecursionDepth)
+ if (Depth >= MaxDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
@@ -35947,7 +35975,7 @@ static SDValue combineX86ShufflesRecursively(
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
- if (Ops.size() < (MaxRecursionDepth - Depth)) {
+ if (Ops.size() < (MaxDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
@@ -35959,7 +35987,7 @@ static SDValue combineX86ShufflesRecursively(
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
@@ -36011,6 +36039,7 @@ static SDValue combineX86ShufflesRecursively(
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
@@ -36316,6 +36345,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
@@ -37799,16 +37829,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
- // TODO: Handle other depths, but first we need to handle the fact that
- // it might combine to the same shuffle.
- if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+ // We need to convert the depth to something combineX86ShufflesRecursively
+ // can handle - so pretend it's Depth == 0 again, and reduce the max depth
+ // to match. This prevents combineX86ShuffleChain from returning a
+ // combined shuffle that's the same as the original root, causing an
+ // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) {
+ assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
+
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
- {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
+ {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
+ /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
@@ -43406,6 +43442,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 8cd4b2f2571f..051493a4ab57 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -2746,12 +2746,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: vmovq %rax, %xmm7
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
+; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8
; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
diff --git a/llvm/test/CodeGen/X86/avx-trunc.ll b/llvm/test/CodeGen/X86/avx-trunc.ll
index 49f57fac70dc..1b1a9b2ded13 100644
--- a/llvm/test/CodeGen/X86/avx-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx-trunc.ll
@@ -16,7 +16,7 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
; CHECK-LABEL: trunc_32_16:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 2bc798c06b77..120d0ec83c4a 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6637,7 +6637,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
@@ -6845,7 +6845,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 0510dadee263..83192cb0ee70 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -182,11 +182,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movmskpd %xmm1, %eax
@@ -249,10 +249,10 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-LABEL: v2i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
index c3b2934fa781..501d375277a3 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -148,7 +148,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -195,7 +195,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-LABEL: v2i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 19f9155d851a..1cbf9078f3f2 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -407,13 +407,13 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
; SSE-LABEL: extract1_i16_zext_insert0_i64_undef:
; SSE: # %bb.0:
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
@@ -650,8 +650,7 @@ define <2 x i64> @extract2_i16_zext_insert1_i64_zero(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) {
; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlq $48, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 8758779d97c5..abe1fa0d3b5d 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -50,12 +50,11 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
define <4 x float> @test_negative_zero_1(<4 x float> %A) {
; SSE2-LABEL: test_negative_zero_1:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll
index 6ed5fb5f49b6..e4d7239cbb44 100644
--- a/llvm/test/CodeGen/X86/combine-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll
@@ -266,7 +266,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: orps %xmm5, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: andps %xmm4, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 383d1866aa1d..2ea90e110742 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -414,10 +414,10 @@ define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $5, %xmm1
+; SSE2-NEXT: psrad $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $3, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; SSE2-NEXT: psrad $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $4, %xmm0
@@ -476,10 +476,10 @@ define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: psrad $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $5, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; SSE2-NEXT: psrad $7, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $6, %xmm0
@@ -541,10 +541,10 @@ define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $5, %xmm1
+; SSE2-NEXT: psrld $3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $3, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; SSE2-NEXT: psrld $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $4, %xmm0
@@ -606,10 +606,10 @@ define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $7, %xmm1
+; SSE2-NEXT: psrld $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $5, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; SSE2-NEXT: psrld $7, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $6, %xmm0
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 09bc1decbf8b..0e329715436b 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -1051,7 +1051,7 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm3
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[0,0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSE-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 886fa6d07c5e..affec3ccb1dc 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -85,9 +85,9 @@ define <8 x i16> @elt5_v8i16(i16 %x) {
define <4 x i32> @elt3_v4i32(i32 %x) {
; X32SSE2-LABEL: elt3_v4i32:
; X32SSE2: # %bb.0:
-; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u>
; X32SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u>
+; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32SSE2-NEXT: retl
;
@@ -95,7 +95,7 @@ define <4 x i32> @elt3_v4i32(i32 %x) {
; X64SSE2: # %bb.0:
; X64SSE2-NEXT: movd %edi, %xmm1
; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u>
-; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X64SSE2-NEXT: retq
;
@@ -166,14 +166,14 @@ define <4 x float> @elt1_v4f32(float %x) {
; X32SSE2: # %bb.0:
; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X32SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X32SSE2-NEXT: retl
;
; X64SSE2-LABEL: elt1_v4f32:
; X64SSE2: # %bb.0:
; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
-; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X64SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X64SSE2-NEXT: retq
;
@@ -237,9 +237,9 @@ define <2 x double> @elt1_v2f64(double %x) {
define <8 x i32> @elt7_v8i32(i32 %x) {
; X32SSE2-LABEL: elt7_v8i32:
; X32SSE2: # %bb.0:
-; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u>
; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u>
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
; X32SSE2-NEXT: retl
@@ -248,7 +248,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) {
; X64SSE2: # %bb.0:
; X64SSE2-NEXT: movd %edi, %xmm0
; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u>
-; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
; X64SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
index 1b2a85eba6a2..000466f598bb 100644
--- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll
+++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
@@ -9,7 +9,7 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-NEXT: vpbroadcastd %xmm1, %xmm1
; X86-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X86-NEXT: retl
;
@@ -17,7 +17,7 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X64-NEXT: vpbroadcastd %xmm1, %xmm1
; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X64-NEXT: retq
%ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 1561fbc844d0..31793cf87cb7 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -321,20 +321,12 @@ define <2 x double> @signbits_sext_shl_sitofp(<2 x i16> %a0) nounwind {
ret <2 x double> %3
}
-; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps
define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
-; X86-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
-; X86: # %bb.0:
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: vcvtdq2pd %xmm0, %xmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
-; X64: # %bb.0:
-; X64-NEXT: vpsrlq $32, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%1 = ashr <2 x i64> %a0, <i64 16, i64 16>
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll
index 29984987c772..4f60c4475f19 100644
--- a/llvm/test/CodeGen/X86/load-partial.ll
+++ b/llvm/test/CodeGen/X86/load-partial.ll
@@ -346,57 +346,40 @@ define i32 @load_partial_illegal_type() {
}
define void @PR43227(i32* %explicit_0, <8 x i32>* %explicit_1) {
-; SSE2-LABEL: PR43227:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm1, 672(%rsi)
-; SSE2-NEXT: movaps %xmm2, 688(%rsi)
-; SSE2-NEXT: retq
+; SSE-LABEL: PR43227:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm1, 672(%rsi)
+; SSE-NEXT: movdqa %xmm0, 688(%rsi)
+; SSE-NEXT: retq
;
-; SSSE3-LABEL: PR43227:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSSE3-NEXT: movaps %xmm1, 672(%rsi)
-; SSSE3-NEXT: movaps %xmm2, 688(%rsi)
-; SSSE3-NEXT: retq
+; AVX1-LABEL: PR43227:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, 672(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
-; SSE41-LABEL: PR43227:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: movdqa %xmm1, 672(%rsi)
-; SSE41-NEXT: movdqa %xmm0, 688(%rsi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: PR43227:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovaps %ymm0, 672(%rsi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: PR43227:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
%1 = getelementptr i32, i32* %explicit_0, i64 63
%2 = bitcast i32* %1 to <3 x i32>*
%3 = load <3 x i32>, <3 x i32>* %2, align 1
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 8c1cc23165c9..b4b3f8cf850f 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1136,7 +1136,7 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
; SSE2-NEXT: je LBB4_4
; SSE2-NEXT: LBB4_3: ## %cond.load1
; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1222,11 +1222,9 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
; SSE2-LABEL: expandload_v4f32_const:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0]
+; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1275,19 +1273,15 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
; SSE2-LABEL: expandload_v16f32_const:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss 52(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
-; SSE2-NEXT: movaps %xmm4, %xmm6
-; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; SSE2-NEXT: movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
-; SSE2-NEXT: movaps %xmm5, %xmm7
-; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[2,0]
+; SSE2-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
+; SSE2-NEXT: movss 52(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero
+; SSE2-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
+; SSE2-NEXT: movss 40(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm6[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm6[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSE2-NEXT: movaps %xmm5, %xmm2
; SSE2-NEXT: movaps %xmm4, %xmm3
@@ -1520,7 +1514,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_64
; SSE2-NEXT: LBB8_63: ## %cond.load121
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0]
; SSE2-NEXT: LBB8_64: ## %else122
; SSE2-NEXT: movaps %xmm0, (%rax)
@@ -1540,7 +1534,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.load1
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm0
@@ -1555,7 +1549,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_8
; SSE2-NEXT: LBB8_7: ## %cond.load9
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $16, %cl
@@ -1568,7 +1562,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_12
; SSE2-NEXT: LBB8_11: ## %cond.load17
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm1
@@ -1583,7 +1577,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_16
; SSE2-NEXT: LBB8_15: ## %cond.load25
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $256, %ecx ## imm = 0x100
@@ -1596,7 +1590,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_20
; SSE2-NEXT: LBB8_19: ## %cond.load33
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm2
@@ -1611,7 +1605,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_24
; SSE2-NEXT: LBB8_23: ## %cond.load41
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000
@@ -1624,7 +1618,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_28
; SSE2-NEXT: LBB8_27: ## %cond.load49
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm3
@@ -1639,7 +1633,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_32
; SSE2-NEXT: LBB8_31: ## %cond.load57
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000
@@ -1652,7 +1646,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_36
; SSE2-NEXT: LBB8_35: ## %cond.load65
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm4
@@ -1667,7 +1661,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_40
; SSE2-NEXT: LBB8_39: ## %cond.load73
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000
@@ -1680,7 +1674,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_44
; SSE2-NEXT: LBB8_43: ## %cond.load81
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm5
@@ -1695,7 +1689,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_48
; SSE2-NEXT: LBB8_47: ## %cond.load89
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $16777216, %ecx ## imm = 0x1000000
@@ -1708,7 +1702,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_52
; SSE2-NEXT: LBB8_51: ## %cond.load97
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm6
@@ -1723,7 +1717,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_56
; SSE2-NEXT: LBB8_55: ## %cond.load105
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $268435456, %ecx ## imm = 0x10000000
@@ -1736,7 +1730,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: je LBB8_60
; SSE2-NEXT: LBB8_59: ## %cond.load113
; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: movaps %xmm8, %xmm7
@@ -2814,7 +2808,7 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32>
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.load1
; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: addq $4, %rdi
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -2829,7 +2823,7 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32>
; SSE2-NEXT: je LBB10_8
; SSE2-NEXT: LBB10_7: ## %cond.load9
; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 4f5687109afc..75e41618263e 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -742,7 +742,7 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2
; SSE2-NEXT: je LBB7_4
; SSE2-NEXT: LBB7_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -834,7 +834,7 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %add
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -927,7 +927,7 @@ define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4
; SSE2-NEXT: je LBB9_4
; SSE2-NEXT: LBB9_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
@@ -940,7 +940,7 @@ define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4
; SSE2-NEXT: je LBB9_8
; SSE2-NEXT: LBB9_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1051,8 +1051,8 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.load1
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: testb $4, %al
@@ -1065,7 +1065,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; SSE2-NEXT: je LBB10_8
; SSE2-NEXT: LBB10_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB10_10
@@ -1076,7 +1076,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; SSE2-NEXT: je LBB10_12
; SSE2-NEXT: LBB10_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: testb $64, %al
@@ -1089,7 +1089,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; SSE2-NEXT: je LBB10_16
; SSE2-NEXT: LBB10_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
; SSE2-NEXT: retq
;
@@ -1237,7 +1237,7 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8
; SSE2-NEXT: je LBB11_16
; SSE2-NEXT: LBB11_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
; SSE2-NEXT: LBB11_16: ## %else20
; SSE2-NEXT: movaps %xmm2, %xmm0
@@ -1250,7 +1250,7 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8
; SSE2-NEXT: je LBB11_4
; SSE2-NEXT: LBB11_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: testb $4, %al
@@ -1263,7 +1263,7 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8
; SSE2-NEXT: je LBB11_8
; SSE2-NEXT: LBB11_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB11_10
@@ -1274,7 +1274,7 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8
; SSE2-NEXT: je LBB11_12
; SSE2-NEXT: LBB11_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: testb $64, %al
@@ -2066,7 +2066,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
; SSE2-NEXT: je LBB17_4
; SSE2-NEXT: LBB17_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -2172,7 +2172,7 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i3
; SSE2-NEXT: je LBB18_4
; SSE2-NEXT: LBB18_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
@@ -2185,7 +2185,7 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i3
; SSE2-NEXT: je LBB18_8
; SSE2-NEXT: LBB18_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -2295,7 +2295,7 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d
; SSE2-NEXT: je LBB19_16
; SSE2-NEXT: LBB19_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT: LBB19_16: ## %else20
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -2308,7 +2308,7 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d
; SSE2-NEXT: je LBB19_4
; SSE2-NEXT: LBB19_3: ## %cond.load1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: testb $4, %al
@@ -2321,7 +2321,7 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d
; SSE2-NEXT: je LBB19_8
; SSE2-NEXT: LBB19_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB19_10
@@ -2332,7 +2332,7 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d
; SSE2-NEXT: je LBB19_12
; SSE2-NEXT: LBB19_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: testb $64, %al
@@ -2495,8 +2495,8 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB20_4
; SSE2-NEXT: LBB20_3: ## %cond.load1
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: testb $4, %al
@@ -2509,7 +2509,7 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; SSE2-NEXT: je LBB20_8
; SSE2-NEXT: LBB20_7: ## %cond.load7
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB20_10
@@ -2520,7 +2520,7 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; SSE2-NEXT: je LBB20_12
; SSE2-NEXT: LBB20_11: ## %cond.load13
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: testb $64, %al
@@ -2533,7 +2533,7 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; SSE2-NEXT: je LBB20_16
; SSE2-NEXT: LBB20_15: ## %cond.load19
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
; SSE2-NEXT: retq
;
@@ -6120,17 +6120,15 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst)
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT: retq
@@ -6216,13 +6214,11 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v4i32:
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -6301,7 +6297,7 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
@@ -6311,7 +6307,7 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
@@ -6443,12 +6439,12 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; SSE2: ## %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 6cf5ff023f3c..b806c2c07695 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -3472,7 +3472,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask
; AVX1-LABEL: truncstore_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 6b65fee913bf..f5f8781d8021 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -512,16 +512,16 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-LABEL: v12i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3]
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT: movaps %xmm2, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2]
; SSE2-NEXT: movaps %xmm2, 32(%rdi)
@@ -908,7 +908,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
@@ -1186,41 +1186,39 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
; SSE2-NEXT: movups 80(%rdi), %xmm8
-; SSE2-NEXT: movups 64(%rdi), %xmm4
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm6
+; SSE2-NEXT: movups 64(%rdi), %xmm3
+; SSE2-NEXT: movdqu (%rdi), %xmm1
+; SSE2-NEXT: movups 16(%rdi), %xmm5
; SSE2-NEXT: movups 32(%rdi), %xmm10
-; SSE2-NEXT: movups 48(%rdi), %xmm12
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
-; SSE2-NEXT: movaps %xmm12, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0]
-; SSE2-NEXT: movaps %xmm6, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm8[2,0]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,0]
-; SSE2-NEXT: movups %xmm12, 16(%rsi)
+; SSE2-NEXT: movdqu 48(%rdi), %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: movaps %xmm10, %xmm7
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm8, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
+; SSE2-NEXT: movups %xmm5, 16(%rsi)
; SSE2-NEXT: movups %xmm11, (%rsi)
-; SSE2-NEXT: movups %xmm6, 16(%rdx)
-; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movups %xmm5, 16(%rcx)
-; SSE2-NEXT: movups %xmm7, (%rcx)
+; SSE2-NEXT: movups %xmm2, 16(%rdx)
+; SSE2-NEXT: movups %xmm1, (%rdx)
+; SSE2-NEXT: movups %xmm6, 16(%rcx)
+; SSE2-NEXT: movups %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
@@ -1420,29 +1418,29 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; SSE2-NEXT: movups (%rcx), %xmm3
; SSE2-NEXT: movups 16(%rcx), %xmm6
; SSE2-NEXT: movaps %xmm3, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3]
; SSE2-NEXT: movaps %xmm1, %xmm9
; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2]
; SSE2-NEXT: movaps %xmm5, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3]
; SSE2-NEXT: movaps %xmm6, %xmm4
; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2]
; SSE2-NEXT: movaps %xmm0, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1]
; SSE2-NEXT: movaps %xmm6, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2]
; SSE2-NEXT: movaps %xmm8, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
; SSE2-NEXT: movaps %xmm3, %xmm6
; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2]
; SSE2-NEXT: movups %xmm3, 16(%rdi)
@@ -1498,20 +1496,20 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; AVX1-LABEL: interleave_24i32_in:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdx), %xmm0
-; AVX1-NEXT: vmovups 16(%rdx), %xmm1
-; AVX1-NEXT: vmovups (%rsi), %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX1-NEXT: vmovups 16(%rcx), %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm2[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,1],xmm3[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,2]
+; AVX1-NEXT: vmovups (%rsi), %xmm1
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rcx), %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX1-NEXT: vmovups 16(%rcx), %xmm1
+; AVX1-NEXT: vmovups 16(%rdx), %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm1[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,1],xmm3[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vbroadcastsd 24(%rsi), %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
@@ -1589,20 +1587,20 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; XOP-NEXT: vmovups (%rcx), %ymm1
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[u,3],ymm0[4],ymm1[u,4],ymm0[5]
; XOP-NEXT: vmovups (%rdx), %xmm1
-; XOP-NEXT: vmovups 16(%rdx), %xmm2
-; XOP-NEXT: vmovups (%rsi), %xmm3
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
-; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
-; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; XOP-NEXT: vmovups 16(%rcx), %xmm3
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
+; XOP-NEXT: vmovups (%rsi), %xmm2
+; XOP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
+; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOP-NEXT: vbroadcastsd (%rcx), %ymm2
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; XOP-NEXT: vmovups 16(%rcx), %xmm2
+; XOP-NEXT: vmovups 16(%rdx), %xmm3
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm2[3,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,1],xmm4[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[1,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; XOP-NEXT: vbroadcastsd 24(%rsi), %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index 46ff47b2a100..f8a112a1beae 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -23,33 +23,19 @@ define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-NEXT: movd %xmm1, (%rdi)
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_v7i8_v2i16_2:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movl (%rsi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movq (%rdi), %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: movb %cl, 6(%rdi)
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: movw %ax, 4(%rdi)
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_v7i8_v2i16_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl (%rsi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: movb %cl, 6(%rdi)
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: movw %ax, 4(%rdi)
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: insert_v7i8_v2i16_2:
+; AVX: # %bb.0:
+; AVX-NEXT: movl (%rsi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movq (%rdi), %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: shrq $48, %rcx
+; AVX-NEXT: movb %cl, 6(%rdi)
+; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: movw %ax, 4(%rdi)
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512: # %bb.0:
@@ -64,20 +50,6 @@ define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; AVX512-NEXT: movw %ax, 4(%rdi)
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
-;
-; XOP-LABEL: insert_v7i8_v2i16_2:
-; XOP: # %bb.0:
-; XOP-NEXT: movl (%rsi), %eax
-; XOP-NEXT: vmovd %eax, %xmm0
-; XOP-NEXT: movq (%rdi), %rcx
-; XOP-NEXT: vmovq %rcx, %xmm1
-; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,1,2,3],xmm1[6,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: shrq $48, %rcx
-; XOP-NEXT: movb %cl, 6(%rdi)
-; XOP-NEXT: shrl $16, %eax
-; XOP-NEXT: movw %ax, 4(%rdi)
-; XOP-NEXT: vmovd %xmm1, (%rdi)
-; XOP-NEXT: retq
%1 = load <2 x i16>, <2 x i16> *%a1
%2 = bitcast <2 x i16> %1 to <4 x i8>
%3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 5a3101bc3efd..84a4600b79b7 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1236,70 +1236,67 @@ entry:
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm14
; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm13
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm14, %xmm10
+; SSE2-NEXT: movdqa %xmm14, %xmm8
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm15
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm15
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm14
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1]
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
-; SSE2-NEXT: movdqa %xmm15, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm12, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm7
-; SSE2-NEXT: paddq %xmm4, %xmm7
-; SSE2-NEXT: psllq $32, %xmm7
-; SSE2-NEXT: pmuludq %xmm12, %xmm0
-; SSE2-NEXT: paddq %xmm7, %xmm0
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm15
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm11, %xmm6
-; SSE2-NEXT: paddq %xmm15, %xmm6
-; SSE2-NEXT: psllq $32, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE2-NEXT: pcmpgtd %xmm15, %xmm13
+; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm15, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm6, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm15, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm11, %xmm5
+; SSE2-NEXT: paddq %xmm4, %xmm5
+; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm11, %xmm1
-; SSE2-NEXT: paddq %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm13, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT: paddq %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm14, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm10, %xmm2
+; SSE2-NEXT: pmuludq %xmm14, %xmm2
; SSE2-NEXT: paddq %xmm5, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm13
-; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm9, %xmm14
-; SSE2-NEXT: paddq %xmm13, %xmm14
-; SSE2-NEXT: psllq $32, %xmm14
-; SSE2-NEXT: pmuludq %xmm9, %xmm3
-; SSE2-NEXT: paddq %xmm14, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm5
+; SSE2-NEXT: paddq %xmm4, %xmm5
+; SSE2-NEXT: psllq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm8, %xmm3
+; SSE2-NEXT: paddq %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
@@ -1350,8 +1347,8 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
define <2 x i64> @pmuldq_square(<2 x i64> %x) {
; SSE2-LABEL: pmuldq_square:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index ba33fed60f3f..72688a63a9d4 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -393,11 +393,10 @@ define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhsw_v4i16_ashr:
; SSE2: # %bb.0:
; SSE2-NEXT: pmulhw %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v4i16_ashr:
@@ -1505,34 +1504,30 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhuw_v8i16_lshr_i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3]
; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; SSE2-NEXT: pmuludq %xmm7, %xmm0
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: psrlq $16, %xmm0
-; SSE2-NEXT: psrlq $16, %xmm1
+; SSE2-NEXT: psrlq $16, %xmm4
; SSE2-NEXT: psrlq $16, %xmm2
; SSE2-NEXT: psrlq $16, %xmm3
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhuw_v8i16_lshr_i64:
@@ -1571,71 +1566,66 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhsw_v8i16_lshr_i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm12
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm14
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm11
+; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm12
+; SSE2-NEXT: movdqa %xmm7, %xmm9
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm7, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm7
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm6
-; SSE2-NEXT: paddq %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: paddq %xmm2, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
-; SSE2-NEXT: paddq %xmm7, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT: paddq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm11, %xmm2
-; SSE2-NEXT: pmuludq %xmm13, %xmm11
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm13, %xmm4
-; SSE2-NEXT: paddq %xmm4, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm1, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: paddq %xmm11, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm12, %xmm1
-; SSE2-NEXT: pmuludq %xmm5, %xmm12
-; SSE2-NEXT: movdqa %xmm14, %xmm4
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm4
-; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: pmuludq %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm7, %xmm0
+; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: paddq %xmm12, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm9, %xmm0
-; SSE2-NEXT: pmuludq %xmm10, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm10, %xmm14
-; SSE2-NEXT: paddq %xmm14, %xmm0
+; SSE2-NEXT: paddq %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm0
+; SSE2-NEXT: pmuludq %xmm9, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: paddq %xmm9, %xmm0
+; SSE2-NEXT: paddq %xmm8, %xmm0
; SSE2-NEXT: psrlq $16, %xmm0
; SSE2-NEXT: psrlq $16, %xmm1
; SSE2-NEXT: psrlq $16, %xmm2
@@ -1678,71 +1668,66 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhsw_v8i16_ashr_i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm12
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm14
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm10
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm11
+; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm7, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm5
-; SSE2-NEXT: paddq %xmm5, %xmm6
-; SSE2-NEXT: psllq $32, %xmm6
-; SSE2-NEXT: paddq %xmm7, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm11, %xmm4
-; SSE2-NEXT: pmuludq %xmm13, %xmm11
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm13, %xmm2
-; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: paddq %xmm7, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: paddq %xmm11, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm12, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm12
-; SSE2-NEXT: movdqa %xmm14, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm11, %xmm7
+; SSE2-NEXT: pmuludq %xmm6, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: paddq %xmm1, %xmm7
+; SSE2-NEXT: psllq $32, %xmm7
+; SSE2-NEXT: paddq %xmm11, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm1
+; SSE2-NEXT: pmuludq %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm2, %xmm1
; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: paddq %xmm12, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm9, %xmm0
-; SSE2-NEXT: pmuludq %xmm10, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
-; SSE2-NEXT: pmuludq %xmm10, %xmm14
-; SSE2-NEXT: paddq %xmm14, %xmm0
+; SSE2-NEXT: paddq %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm0
+; SSE2-NEXT: pmuludq %xmm9, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm2
+; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: paddq %xmm9, %xmm0
+; SSE2-NEXT: paddq %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -1755,18 +1740,18 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: psrlq $16, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrlq $16, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
+; SSE2-NEXT: psrlq $16, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
-; SSE2-NEXT: psrlq $16, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
+; SSE2-NEXT: psrlq $16, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v8i16_ashr_i64:
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index d07829725cf6..8db4d7713ada 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -34,7 +34,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
@@ -42,7 +42,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2
; CHECK-NEXT: vmovaps %xmm13, %xmm1
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
index d6fcac28f62a..70ec11e87f06 100644
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -29,13 +29,11 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE2-NEXT: movaps {{.*#+}} xmm4 = <1,1,u,0>
-; SSE2-NEXT: xorps %xmm5, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1,3,3]
-; SSE2-NEXT: psllq $63, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm4
+; SSE2-NEXT: psllq $63, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm1
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 53ff270ad340..1ab456c380ad 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -558,7 +558,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
@@ -908,7 +908,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
@@ -1038,7 +1038,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 5a0f23f6a09c..f30364cbaed7 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -899,7 +899,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm1
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
@@ -935,7 +935,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm1
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
@@ -1472,7 +1472,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
@@ -1497,7 +1497,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shuffle-of-insert.ll b/llvm/test/CodeGen/X86/shuffle-of-insert.ll
index bf1d74852553..9c2fc9d19fe8 100644
--- a/llvm/test/CodeGen/X86/shuffle-of-insert.ll
+++ b/llvm/test/CodeGen/X86/shuffle-of-insert.ll
@@ -30,7 +30,7 @@ define <4 x i32> @ins_elt_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
@@ -80,7 +80,7 @@ define <4 x i32> @ins_elt_3_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movd %edi, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
@@ -153,7 +153,7 @@ define <4 x i32> @ins_elt_2_to_3(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movd %edi, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
@@ -176,7 +176,7 @@ define <4 x i32> @ins_elt_3_to_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_3_to_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
index e0bc3240726f..d55fd278586d 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -13,19 +13,7 @@ define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rsi)
; SSE2-NEXT: retq
@@ -670,10 +658,10 @@ define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 11c1f2e76f83..fbc14807d053 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -187,7 +187,7 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 432faf5d12d1..e4be8f5a273b 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -435,10 +435,9 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 6c72adbc6317..9982f34f736d 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -1065,31 +1065,29 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,2147483649,1374389535]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE2-NEXT: psrad $5, %xmm3
@@ -1391,31 +1389,29 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE2-NEXT: psrad $5, %xmm3
@@ -2694,31 +2690,29 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1717986919]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE2-NEXT: psrad $1, %xmm3
@@ -2965,31 +2959,29 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE2-NEXT: psrad $5, %xmm3
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index c5cf32732d5c..b9f67d132119 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -70,8 +70,8 @@ define void @fail(i16 %a, <2 x i8> %b) {
; CHECK-X64-NEXT: je .LBB1_3
; CHECK-X64-NEXT: # %bb.1:
; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
-; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax
; CHECK-X64-NEXT: testb $1, %al
; CHECK-X64-NEXT: jne .LBB1_3
; CHECK-X64-NEXT: # %bb.2: # %no
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 4c7acf60d308..1106c6b7b4ef 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -79,12 +79,8 @@ define <2 x i32> @test5(<8 x i32> %v) {
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -181,12 +177,8 @@ define <2 x i32> @test10(<8 x i32> %v) {
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index e42e527553f1..1eef8c78cc5c 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -238,36 +238,37 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: movdqa %xmm1, %xmm3
+; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: movq %xmm3, %rcx
+; X64-NEXT: movdqa %xmm0, %xmm4
; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; X64-NEXT: movq %xmm4, %rcx
-; X64-NEXT: movdqa %xmm0, %xmm5
-; X64-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-NEXT: psllq $31, %xmm5
-; X64-NEXT: movq %xmm5, %rax
+; X64-NEXT: psllq $31, %xmm4
+; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X64-NEXT: movq %xmm4, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
; X64-NEXT: movq %xmm4, %rax
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: movq %xmm4, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm4
; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X64-NEXT: movq %xmm2, %rcx
; X64-NEXT: psllq $31, %xmm0
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm0
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index f6c8baf2e923..d2e3b80c2145 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -323,22 +323,23 @@ define i16 @func7(i16 %x, i16 %y) nounwind {
define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm8, %xmm8
+; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: movdqa %xmm0, %xmm4
-; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; X64-NEXT: paddq %xmm4, %xmm4
-; X64-NEXT: psllq $31, %xmm4
-; X64-NEXT: movq %xmm4, %rax
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X64-NEXT: paddq %xmm2, %xmm2
+; X64-NEXT: psllq $31, %xmm2
+; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm7
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm2, %rax
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: movq %xmm2, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
@@ -346,13 +347,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
; X64-NEXT: movdqa %xmm7, %xmm2
; X64-NEXT: pxor %xmm4, %xmm2
-; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751]
-; X64-NEXT: movdqa %xmm9, %xmm6
+; X64-NEXT: movdqa {{.*#+}} xmm8 = [9223372043297226751,9223372043297226751]
+; X64-NEXT: movdqa %xmm8, %xmm6
; X64-NEXT: pcmpgtd %xmm2, %xmm6
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; X64-NEXT: pcmpeqd %xmm9, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; X64-NEXT: pcmpeqd %xmm8, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X64-NEXT: pand %xmm3, %xmm5
+; X64-NEXT: pand %xmm9, %xmm5
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
; X64-NEXT: por %xmm5, %xmm2
; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591]
@@ -360,28 +361,28 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pandn %xmm6, %xmm2
; X64-NEXT: por %xmm7, %xmm2
; X64-NEXT: psrlq $1, %xmm2
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
-; X64-NEXT: movq %xmm1, %rcx
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X64-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; X64-NEXT: movq %xmm3, %rcx
; X64-NEXT: paddq %xmm0, %xmm0
; X64-NEXT: psllq $31, %xmm0
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; X64-NEXT: pxor %xmm3, %xmm4
-; X64-NEXT: movdqa %xmm9, %xmm0
+; X64-NEXT: movdqa %xmm8, %xmm0
; X64-NEXT: pcmpgtd %xmm4, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; X64-NEXT: pcmpeqd %xmm9, %xmm4
+; X64-NEXT: pcmpeqd %xmm8, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; X64-NEXT: pand %xmm1, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 923aaa34f04d..913e45bc2b6d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -200,26 +200,24 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: psrld $31, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -300,26 +298,24 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: psrld $31, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
@@ -638,13 +634,12 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -722,25 +717,23 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -857,15 +850,15 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; CHECK-SSE41-NEXT: psrld $5, %xmm3
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -880,11 +873,11 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -989,25 +982,22 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -1098,20 +1088,18 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: movaps %xmm2, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; CHECK-SSE2-NEXT: psrld $5, %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -1208,13 +1196,12 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -1292,25 +1279,23 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -1427,15 +1412,15 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; CHECK-SSE41-NEXT: psrld $5, %xmm3
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1450,11 +1435,11 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1512,17 +1497,17 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: psrld $31, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,5]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -1536,15 +1521,15 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm3
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1558,11 +1543,11 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1614,24 +1599,23 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,14]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: psrld $31, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,14]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -1648,15 +1632,15 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm4
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4
-; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1672,11 +1656,11 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1722,25 +1706,28 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
+; CHECK-SSE2-NEXT: psrld $5, %xmm4
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
@@ -1758,13 +1745,13 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: psrld $31, %xmm3
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1780,9 +1767,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1875,24 +1863,21 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
@@ -1996,9 +1981,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -2099,18 +2082,16 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -2198,23 +2179,20 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
@@ -2315,9 +2293,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -2408,23 +2384,25 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -2435,18 +2413,18 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrld $2, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrld $31, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
+; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2456,9 +2434,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
@@ -2507,26 +2486,27 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: psrld $1, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
@@ -2542,18 +2522,18 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT: psrld $2, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrld $31, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
+; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2565,9 +2545,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index e549b222796d..bf3b6ce355f9 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -332,13 +332,11 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: t32_tautological:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_insert-2.ll b/llvm/test/CodeGen/X86/vec_insert-2.ll
index 2f06350b58b8..33de9dd931b0 100644
--- a/llvm/test/CodeGen/X86/vec_insert-2.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-2.ll
@@ -6,13 +6,13 @@ define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
; X32-LABEL: t1:
; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: retl
;
; X64-LABEL: t1:
; X64: # %bb.0:
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -24,14 +24,14 @@ define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: t2:
; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: retl
;
; X64-LABEL: t2:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm1
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X64-NEXT: retq
%tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3
diff --git a/llvm/test/CodeGen/X86/vec_insert-3.ll b/llvm/test/CodeGen/X86/vec_insert-3.ll
index c844945d2711..9fb6fa957036 100644
--- a/llvm/test/CodeGen/X86/vec_insert-3.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-3.ll
@@ -6,11 +6,9 @@ define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
; X32-LABEL: t1:
; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movaps %xmm0, %xmm2
-; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; X32-NEXT: retl
;
; X64-LABEL: t1:
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index c6815a278f82..8a75881f01df 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -11,7 +11,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $12, %ecx
; X32-NEXT: movd %ecx, %xmm0
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X32-NEXT: psllq $32, %xmm0
; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: retl
;
@@ -19,7 +19,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
; X64: # %bb.0:
; X64-NEXT: shll $12, %edi
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X64-NEXT: psllq $32, %xmm0
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
@@ -36,7 +36,7 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X32-NEXT: retl
;
@@ -44,7 +44,7 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index d3d51a35d337..63166bab0c61 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -189,7 +189,7 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE2-LABEL: sitofp_2i16_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -213,7 +213,7 @@ define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE2-LABEL: sitofp_8i16_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -246,7 +246,7 @@ define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE2-LABEL: sitofp_2i8_to_2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -271,7 +271,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE2-LABEL: sitofp_16i8_to_2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -3071,7 +3071,7 @@ define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_set-6.ll b/llvm/test/CodeGen/X86/vec_set-6.ll
index 3f8997faf392..25de30696081 100644
--- a/llvm/test/CodeGen/X86/vec_set-6.ll
+++ b/llvm/test/CodeGen/X86/vec_set-6.ll
@@ -12,7 +12,7 @@ define <4 x float> @test(float %a, float %b, float %c) nounwind {
;
; X64-LABEL: test:
; X64: # %bb.0:
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 155990c3df4a..dbe8ed2f6ea5 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -275,7 +275,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
@@ -711,7 +711,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll
index 6d1016ba1d73..d4833f9ac65b 100644
--- a/llvm/test/CodeGen/X86/vector-pack-256.ll
+++ b/llvm/test/CodeGen/X86/vector-pack-256.ll
@@ -228,7 +228,7 @@ define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounw
; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 47fc38dbefc6..30c1b4d9994a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -297,7 +297,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index b44b66eb1d87..17671c6a5591 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -19,8 +19,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -34,7 +33,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -48,7 +47,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -62,7 +61,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -107,8 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -131,7 +129,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -155,7 +153,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -179,7 +177,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -203,7 +201,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -274,8 +272,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -315,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -347,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -380,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -413,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -527,8 +524,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -602,7 +598,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -650,7 +646,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -691,7 +687,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -732,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -1691,17 +1687,16 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $8, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
@@ -1871,17 +1866,16 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $8, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
@@ -2091,17 +2085,16 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $8, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
@@ -2386,17 +2379,16 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $8, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 408e0c9df0b7..5dc9b7033d82 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -286,7 +286,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index c6c43a4bd4be..fb019ffd99e9 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -291,7 +291,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index aa5c713d527b..19d9b159fd83 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1865,19 +1865,19 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
; SSE: # %bb.0:
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE-NEXT: psrlq $8, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; AVX1-NEXT: vpsrlq $8, %xmm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; AVX2-SLOW-NEXT: vpsrlq $8, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX2-SLOW-NEXT: retq
;
@@ -1893,7 +1893,7 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(
;
; XOP-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
; XOP: # %bb.0:
-; XOP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; XOP-NEXT: vpsrlq $8, %xmm0, %xmm0
; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; XOP-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
@@ -2459,12 +2459,10 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index b5fc242daf45..7f84e23cf321 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -332,19 +332,19 @@ define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0124:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0124:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0124:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
@@ -377,19 +377,19 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0142:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0142:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0142:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
@@ -425,21 +425,21 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0412:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0412:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0412:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -451,12 +451,19 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v4i32_0412:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
-; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i32_0412:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i32_0412:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0412:
; AVX512VL: # %bb.0:
@@ -469,21 +476,21 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_4012:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_4012:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_4012:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -672,22 +679,22 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z4zz:
; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_z4zz:
; SSE3: # %bb.0:
+; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_z4zz:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
@@ -707,26 +714,23 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zz4z:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_zz4z:
; SSE3: # %bb.0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
+; SSE3-NEXT: pxor %xmm0, %xmm0
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_zz4z:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_zz4z:
@@ -821,21 +825,21 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z6zz:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_z6zz:
; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_z6zz:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
@@ -856,7 +860,7 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_0z23:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -864,7 +868,7 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSE3-LABEL: shuffle_v4f32_0z23:
; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
@@ -872,7 +876,7 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSSE3-LABEL: shuffle_v4f32_0z23:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -1058,29 +1062,29 @@ define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0zz4:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0zz4:
; SSE3: # %bb.0:
-; SSE3-NEXT: xorps %xmm2, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE3-NEXT: movaps %xmm2, %xmm0
+; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0zz4:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0zz4:
@@ -1142,27 +1146,27 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0z24:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z24:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE3-NEXT: xorps %xmm2, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
+; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSE3-NEXT: movaps %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z24:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
@@ -1347,22 +1351,22 @@ define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_z6zz:
; SSE2: # %bb.0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_z6zz:
; SSE3: # %bb.0:
+; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_z6zz:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
@@ -1565,13 +1569,13 @@ define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_2456:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_2456:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
; SSE3-NEXT: retq
;
@@ -2100,19 +2104,19 @@ define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: extract3_insert3_v4i32_0127:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: extract3_insert3_v4i32_0127:
; SSE3: # %bb.0:
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: extract3_insert3_v4i32_0127:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 86423ce76065..c72d736960f9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1343,7 +1343,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
@@ -2949,19 +2949,19 @@ define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) {
; SSE-LABEL: shuffle_v8i16_9zzzuuuu:
; SSE: # %bb.0:
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_9zzzuuuu:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_9zzzuuuu:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: retq
;
@@ -2972,7 +2972,7 @@ define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) {
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_9zzzuuuu:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX512VL-SLOW-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 729b1ad59429..1035dfa1e660 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3504,7 +3504,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2
; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
@@ -3528,7 +3528,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2
; XOPAVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; XOPAVX2-NEXT: retq
@@ -5075,8 +5075,8 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,u,u,16,17,24,25,18,19,26,27,20,21,28,29,22,23,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
@@ -5100,8 +5100,8 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; XOPAVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,u,u,16,17,24,25,18,19,26,27,20,21,28,29,22,23,u,u]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 2, i32 22, i32 3, i32 31, i32 8, i32 28, i32 9, i32 29, i32 10, i32 30, i32 11, i32 31>
@@ -5172,7 +5172,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
; AVX2-NEXT: retq
@@ -5197,7 +5197,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; XOPAVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
; XOPAVX2-NEXT: retq
@@ -5511,17 +5511,17 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,u,u,16,17,20,21,18,19,22,23,24,25,28,29,26,27,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
@@ -5545,9 +5545,9 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; XOPAVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 1, i32 3, i32 20, i32 22, i32 21, i32 31, i32 8, i32 10, i32 9, i32 11, i32 28, i32 30, i32 29, i32 31>
@@ -5808,7 +5808,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
@@ -5833,7 +5833,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1
; XOPAVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23]
; XOPAVX2-NEXT: retq
@@ -7364,17 +7364,30 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR34369:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: PR34369:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR34369:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: PR34369:
; AVX512VL: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index ae402f507ac9..4c8c8f6312f1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -130,7 +130,7 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT: retq
;
@@ -368,7 +368,7 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08991abb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
@@ -1452,7 +1452,7 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT: retq
;
@@ -1719,7 +1719,7 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08991abb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
@@ -1757,8 +1757,8 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_091b2d3f:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index f34a541150ba..6420a62ff0ba 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -864,12 +864,19 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_nested_undef_test15:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_nested_undef_test15:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
ret <4 x i32> %2
@@ -2526,13 +2533,13 @@ define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2: # %bb.0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
@@ -2743,7 +2750,7 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -2751,7 +2758,7 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSSE3-LABEL: PR30264:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -2991,11 +2998,9 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: pextrw $7, %xmm1, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movsbl (%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSSE3-NEXT: movsbl (%rdx), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 957a3c544fd4..db414718b63f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -1214,22 +1214,55 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
;
define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
-; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
-; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
-; SSE-NEXT: # kill: def $edx killed $edx def $rdx
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $3, %edx
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $3, %ecx
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE2-NEXT: # kill: def $edx killed $edx def $rdx
+; SSE2-NEXT: # kill: def $edi killed $edi def $rdi
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: andl $3, %edx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSSE3-NEXT: # kill: def $edx killed $edx def $rdx
+; SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
+; SSSE3-NEXT: andl $3, %edi
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE41-NEXT: # kill: def $edx killed $edx def $rdx
+; SSE41-NEXT: # kill: def $edi killed $edi def $rdi
+; SSE41-NEXT: andl $3, %edi
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: andl $3, %edx
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: andl $3, %ecx
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
+; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; AVX: # %bb.0:
@@ -1243,8 +1276,7 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%x0 = extractelement <4 x float> %x, i32 %i0
@@ -1292,8 +1324,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
@@ -1329,8 +1361,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 81a6f60f034f..a71c9497879a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -155,7 +155,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -454,7 +454,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -613,7 +613,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -999,7 +999,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -1425,7 +1425,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1914,7 +1914,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2273,7 +2273,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -2433,7 +2433,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2899,7 +2899,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -3286,7 +3286,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -3662,7 +3662,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -4049,7 +4049,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -4425,7 +4425,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -4812,7 +4812,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index d52e6195f8df..b6d1b0d16f7a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -116,12 +116,10 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
@@ -339,7 +337,7 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; AVX1-LABEL: trunc8i32_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2035,13 +2033,14 @@ define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>
; AVX1-LABEL: store_merge_split:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: shlq $4, %rdi
; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index eb4f60dc1450..d97b9a359b1a 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -1754,7 +1754,7 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: psrlq $8, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index aba908f34696..8890076242e9 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -569,7 +569,7 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll
index 7d6fe0688c84..08f553e05aee 100644
--- a/llvm/test/CodeGen/X86/vshift-4.ll
+++ b/llvm/test/CodeGen/X86/vshift-4.ll
@@ -58,19 +58,15 @@ define void @shift2a(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2a:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2a:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
entry:
@@ -84,19 +80,15 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2b:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2b:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
entry:
@@ -110,19 +102,15 @@ define void @shift2c(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2c:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2c:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
entry:
@@ -136,8 +124,7 @@ define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
; X32-LABEL: shift3a:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
; X32-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: psllw %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
@@ -145,8 +132,7 @@ define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
;
; X64-LABEL: shift3a:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
; X64-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: psllw %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)