[llvm] 842a672 - [X86] LowerTRUNCATE - improve handling during type legalization to PACKSS/PACKUS patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 11 02:39:54 PDT 2023
Author: Simon Pilgrim
Date: 2023-07-11T10:39:44+01:00
New Revision: 842a6728d95057b5b048453f75139be6c7c18f93
URL: https://github.com/llvm/llvm-project/commit/842a6728d95057b5b048453f75139be6c7c18f93
DIFF: https://github.com/llvm/llvm-project/commit/842a6728d95057b5b048453f75139be6c7c18f93.diff
LOG: [X86] LowerTRUNCATE - improve handling during type legalization to PACKSS/PACKUS patterns
Extend coverage for lowering wide vector types during type legalization to allow us to use PACKSS/PACKUS patterns instead of dropping down to shuffle lowering.
First step towards avoiding the premature folds of TRUNCATE to PACKSS/PACKUS nodes described in Issue #63710, which cause a large number of regressions on D152928. The next step will be to tweak the TRUNCATE widening in ReplaceNodeResults.
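For example, on SSE2 an illegal wide truncation such as (illustrative IR, function name hypothetical):

  define <8 x i16> @trunc_v8i64_v8i16(<8 x i64> %a) {
    %t = trunc <8 x i64> %a to <8 x i16>
    ret <8 x i16> %t
  }

previously legalized into a long pshufd/pshuflw/punpckldq/movsd shuffle chain; with this change it can instead be narrowed to v8i32 and packed, giving roughly the following (as seen in the updated masked_store_trunc.ll / vector-reduce-*-bool.ll checks below):

  shufps   $0x88, %xmm3, %xmm2   # xmm2 = xmm2[0,2],xmm3[0,2]
  pslld    $16, %xmm2
  psrad    $16, %xmm2
  shufps   $0x88, %xmm1, %xmm0   # xmm0 = xmm0[0,2],xmm1[0,2]
  pslld    $16, %xmm0
  psrad    $16, %xmm0
  packssdw %xmm2, %xmm0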
Differential Revision: https://reviews.llvm.org/D154592
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
llvm/test/CodeGen/X86/bitcast-vector-bool.ll
llvm/test/CodeGen/X86/cast-vsel.ll
llvm/test/CodeGen/X86/masked_store.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bf8acd38eaea2a..475dac3f4b2dc3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1239,7 +1239,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
@@ -1480,9 +1488,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
+
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1802,7 +1812,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -2338,10 +2347,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
}
-
- setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
if (Subtarget.hasAMXTILE()) {
@@ -22869,6 +22874,84 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
}
+/// This function lowers a vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
+static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
+ const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = In.getSimpleValueType();
+ MVT DstSVT = DstVT.getVectorElementType();
+ MVT SrcSVT = SrcVT.getVectorElementType();
+ if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
+ (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
+ return SDValue();
+
+ unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
+ unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
+
+ // Truncate with PACKUS if we are truncating a vector with leading zero
+ // bits that extend all the way to the packed/truncated value. Pre-SSE41
+ // we can only use PACKUSWB.
+ KnownBits Known = DAG.computeKnownBits(In);
+ if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
+ if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG,
+ Subtarget))
+ return V;
+
+ // Truncate with PACKSS if we are truncating a vector with sign-bits
+ // that extend all the way to the packed/truncated value.
+ if ((NumSrcEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG,
+ Subtarget))
+ return V;
+
+ return SDValue();
+}
+
+/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations.
+static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = In.getSimpleValueType();
+ MVT DstSVT = DstVT.getVectorElementType();
+ MVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumElems = DstVT.getVectorNumElements();
+ if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
+ (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+ NumElems >= 8))
+ return SDValue();
+
+ // SSSE3's pshufb results in less instructions in the cases below.
+ if (Subtarget.hasSSSE3() && NumElems == 8) {
+ if (SrcSVT == MVT::i16)
+ return SDValue();
+ if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
+ return SDValue();
+ }
+
+ // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+ // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+ // truncate 2 x v4i32 to v8i16.
+ if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
+ return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
+
+ if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
+ return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
+
+ // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
+ if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
+ MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
+ return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
+ }
+
+ return SDValue();
+}
+
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -22955,8 +23038,6 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
- unsigned InNumEltBits = InVT.getScalarSizeInBits();
-
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
@@ -22964,7 +23045,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
- VT.is128BitVector()) {
+ VT.is128BitVector() && Subtarget.hasAVX512()) {
assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
"Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
@@ -22981,6 +23062,15 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
+ // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
+ if (!Subtarget.hasAVX512()) {
+ if (SDValue SignPack =
+ LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
+ return SignPack;
+
+ return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
+ }
+
// Otherwise let default legalization handle it.
return SDValue();
}
@@ -22988,28 +23078,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
- unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
- unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
-
// Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
// concat from subvectors to use VPTRUNC etc.
- if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) {
- // Truncate with PACKUS if we are truncating a vector with leading zero
- // bits that extend all the way to the packed/truncated value. Pre-SSE41
- // we can only use PACKUSWB.
- KnownBits Known = DAG.computeKnownBits(In);
- if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
- if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG,
- Subtarget))
- return V;
-
- // Truncate with PACKSS if we are truncating a vector with sign-bits
- // that extend all the way to the packed/truncated value.
- if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
- if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG,
- Subtarget))
- return V;
- }
+ if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
+ if (SDValue SignPack =
+ LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
+ return SignPack;
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
@@ -23068,27 +23142,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBitcast(MVT::v8i16, In);
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(4, DL));
-
- // The PSHUFB mask:
- static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
-
- OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
- OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
-
- OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
-
- OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
- OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
-
- // The MOVLHPS Mask:
- static const int ShufMask2[] = {0, 1, 4, 5};
- SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
- return DAG.getBitcast(MVT::v8i16, res);
+ return Subtarget.hasSSE41()
+ ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
+ : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
}
if (VT == MVT::v16i8 && InVT == MVT::v16i16)
@@ -53152,6 +53208,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
///
/// difficult to do this optimization based on them.
+/// TODO: Remove this and just use LowerTruncateVecPack.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
@@ -53200,6 +53257,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
+/// TODO: Remove this and just use LowerTruncateVecPackWithSignBits.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 2f7f9c554c0ef5..e5e9fe3605d3fb 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -451,13 +451,13 @@ define i8 @v8i32_or_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d,
define i8 @v8i32_or_vselect(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; SSE2-SSSE3-LABEL: v8i32_or_vselect:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4
-; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
@@ -514,10 +514,8 @@ define i8 @v8i32_or_vselect(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3, i1 %a4) {
; SSE2-SSSE3-LABEL: v8i32_or_select:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: testb $1, %dil
; SSE2-SSSE3-NEXT: jne .LBB7_1
; SSE2-SSSE3-NEXT: # %bb.2:
@@ -528,7 +526,9 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: .LBB7_3:
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm7
; SSE2-SSSE3-NEXT: por %xmm2, %xmm6
+; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
; SSE2-SSSE3-NEXT: por %xmm0, %xmm6
; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6
; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
index 4816615db255ce..6acce84645e889 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -356,21 +356,18 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
;
; AVX2-LABEL: v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm3
+; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
@@ -450,21 +447,18 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x floa
;
; AVX2-LABEL: v16f32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm4
-; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vandps %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vandps %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vandps %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vandps %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm2
+; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm3
+; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 042368c1f310d2..90fe8276171d19 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -720,20 +720,16 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
define i1 @trunc_v8i64_cmp(<8 x i64> %a0) nounwind {
; SSE2-SSSE3-LABEL: trunc_v8i64_cmp:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm2
-; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2
-; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: psllw $15, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: cmpb $-1, %al
; SSE2-SSSE3-NEXT: sete %al
; SSE2-SSSE3-NEXT: retq
@@ -1471,16 +1467,12 @@ define i8 @PR59526(<8 x i32> %a, <8 x i32> %b, ptr %mask) {
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm1
; SSE-NEXT: movdqu (%rdi), %xmm2
-; SSE-NEXT: movdqu 16(%rdi), %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: packssdw %xmm4, %xmm5
-; SSE-NEXT: pmovmskb %xmm5, %eax
-; SSE-NEXT: testl %eax, %eax
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: movdqu 16(%rdi), %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm0, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
index fb0470fd3b4559..019eae7c428c6c 100644
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -318,10 +318,10 @@ define dso_local void @example25() nounwind {
; SSE41-NEXT: andps %xmm2, %xmm4
; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm3
; SSE41-NEXT: andps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm0, %xmm4
; SSE41-NEXT: andps %xmm0, %xmm3
+; SSE41-NEXT: psrld $31, %xmm4
; SSE41-NEXT: movaps %xmm3, dj+4112(%rax)
-; SSE41-NEXT: movaps %xmm4, dj+4096(%rax)
+; SSE41-NEXT: movdqa %xmm4, dj+4096(%rax)
; SSE41-NEXT: addq $32, %rax
; SSE41-NEXT: jne .LBB5_1
; SSE41-NEXT: # %bb.2: # %for.end
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 9ae1e695482c78..ea4ce2681a801a 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -5608,118 +5608,98 @@ define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask)
define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) {
; SSE2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
; SSE2: ## %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 32(%rdi), %xmm7
+; SSE2-NEXT: movdqa 64(%rdi), %xmm8
; SSE2-NEXT: movl 80(%rsi), %eax
-; SSE2-NEXT: movl 64(%rsi), %ecx
-; SSE2-NEXT: movl 48(%rsi), %r8d
-; SSE2-NEXT: movl 32(%rsi), %r9d
-; SSE2-NEXT: movl 16(%rsi), %r10d
+; SSE2-NEXT: movl 64(%rsi), %r8d
+; SSE2-NEXT: movl 48(%rsi), %r9d
+; SSE2-NEXT: movl 32(%rsi), %r10d
+; SSE2-NEXT: movl 16(%rsi), %r11d
; SSE2-NEXT: movdqa 80(%rsi), %xmm0
; SSE2-NEXT: movdqa 64(%rsi), %xmm1
; SSE2-NEXT: movdqa 48(%rsi), %xmm2
; SSE2-NEXT: movdqa 32(%rsi), %xmm3
; SSE2-NEXT: movdqa 16(%rsi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3]
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = mem[0,2,2,3]
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3]
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3]
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE2-NEXT: pmovmskb %xmm9, %r11d
-; SSE2-NEXT: andl $21845, %r11d ## imm = 0x5555
-; SSE2-NEXT: pmovmskb %xmm7, %edi
-; SSE2-NEXT: andl $85, %edi
-; SSE2-NEXT: shll $16, %edi
-; SSE2-NEXT: orl %r11d, %edi
-; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: packssdw 48(%rdi), %xmm7
+; SSE2-NEXT: packssdw 16(%rdi), %xmm6
+; SSE2-NEXT: packsswb %xmm7, %xmm6
+; SSE2-NEXT: packssdw 80(%rdi), %xmm8
+; SSE2-NEXT: packsswb %xmm8, %xmm8
+; SSE2-NEXT: pmovmskb %xmm6, %edi
+; SSE2-NEXT: andl $21845, %edi ## imm = 0x5555
+; SSE2-NEXT: pmovmskb %xmm8, %ecx
+; SSE2-NEXT: andl $85, %ecx
+; SSE2-NEXT: shll $16, %ecx
+; SSE2-NEXT: orl %edi, %ecx
+; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: jne LBB31_1
; SSE2-NEXT: ## %bb.2: ## %else
-; SSE2-NEXT: testb $2, %dil
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne LBB31_3
; SSE2-NEXT: LBB31_4: ## %else2
-; SSE2-NEXT: testb $4, %dil
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne LBB31_5
; SSE2-NEXT: LBB31_6: ## %else4
-; SSE2-NEXT: testb $8, %dil
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne LBB31_7
; SSE2-NEXT: LBB31_8: ## %else6
-; SSE2-NEXT: testb $16, %dil
+; SSE2-NEXT: testb $16, %cl
; SSE2-NEXT: jne LBB31_9
; SSE2-NEXT: LBB31_10: ## %else8
-; SSE2-NEXT: testb $32, %dil
+; SSE2-NEXT: testb $32, %cl
; SSE2-NEXT: jne LBB31_11
; SSE2-NEXT: LBB31_12: ## %else10
-; SSE2-NEXT: testb $64, %dil
+; SSE2-NEXT: testb $64, %cl
; SSE2-NEXT: jne LBB31_13
; SSE2-NEXT: LBB31_14: ## %else12
-; SSE2-NEXT: testb %dil, %dil
+; SSE2-NEXT: testb %cl, %cl
; SSE2-NEXT: js LBB31_15
; SSE2-NEXT: LBB31_16: ## %else14
-; SSE2-NEXT: testl $256, %edi ## imm = 0x100
+; SSE2-NEXT: testl $256, %ecx ## imm = 0x100
; SSE2-NEXT: jne LBB31_17
; SSE2-NEXT: LBB31_18: ## %else16
-; SSE2-NEXT: testl $512, %edi ## imm = 0x200
+; SSE2-NEXT: testl $512, %ecx ## imm = 0x200
; SSE2-NEXT: jne LBB31_19
; SSE2-NEXT: LBB31_20: ## %else18
-; SSE2-NEXT: testl $1024, %edi ## imm = 0x400
+; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400
; SSE2-NEXT: jne LBB31_21
; SSE2-NEXT: LBB31_22: ## %else20
-; SSE2-NEXT: testl $2048, %edi ## imm = 0x800
+; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800
; SSE2-NEXT: jne LBB31_23
; SSE2-NEXT: LBB31_24: ## %else22
-; SSE2-NEXT: testl $4096, %edi ## imm = 0x1000
+; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000
; SSE2-NEXT: jne LBB31_25
; SSE2-NEXT: LBB31_26: ## %else24
-; SSE2-NEXT: testl $8192, %edi ## imm = 0x2000
+; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000
; SSE2-NEXT: jne LBB31_27
; SSE2-NEXT: LBB31_28: ## %else26
-; SSE2-NEXT: testl $16384, %edi ## imm = 0x4000
+; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000
; SSE2-NEXT: jne LBB31_29
; SSE2-NEXT: LBB31_30: ## %else28
-; SSE2-NEXT: testw %di, %di
+; SSE2-NEXT: testw %cx, %cx
; SSE2-NEXT: js LBB31_31
; SSE2-NEXT: LBB31_32: ## %else30
-; SSE2-NEXT: testl $65536, %edi ## imm = 0x10000
+; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000
; SSE2-NEXT: jne LBB31_33
; SSE2-NEXT: LBB31_34: ## %else32
-; SSE2-NEXT: testl $131072, %edi ## imm = 0x20000
+; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000
; SSE2-NEXT: jne LBB31_35
; SSE2-NEXT: LBB31_36: ## %else34
-; SSE2-NEXT: testl $262144, %edi ## imm = 0x40000
+; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000
; SSE2-NEXT: jne LBB31_37
; SSE2-NEXT: LBB31_38: ## %else36
-; SSE2-NEXT: testl $524288, %edi ## imm = 0x80000
+; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000
; SSE2-NEXT: jne LBB31_39
; SSE2-NEXT: LBB31_40: ## %else38
-; SSE2-NEXT: testl $1048576, %edi ## imm = 0x100000
+; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000
; SSE2-NEXT: jne LBB31_41
; SSE2-NEXT: LBB31_42: ## %else40
-; SSE2-NEXT: testl $2097152, %edi ## imm = 0x200000
+; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000
; SSE2-NEXT: jne LBB31_43
; SSE2-NEXT: LBB31_44: ## %else42
-; SSE2-NEXT: testl $4194304, %edi ## imm = 0x400000
+; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000
; SSE2-NEXT: je LBB31_46
; SSE2-NEXT: LBB31_45: ## %cond.store43
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -5738,123 +5718,123 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; SSE2-NEXT: LBB31_1: ## %cond.store
; SSE2-NEXT: movl (%rsi), %esi
; SSE2-NEXT: movl %esi, (%rdx)
-; SSE2-NEXT: testb $2, %dil
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je LBB31_4
; SSE2-NEXT: LBB31_3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
; SSE2-NEXT: movd %xmm6, %esi
; SSE2-NEXT: movl %esi, 4(%rdx)
-; SSE2-NEXT: testb $4, %dil
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je LBB31_6
; SSE2-NEXT: LBB31_5: ## %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
; SSE2-NEXT: movd %xmm6, %esi
; SSE2-NEXT: movl %esi, 8(%rdx)
-; SSE2-NEXT: testb $8, %dil
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je LBB31_8
; SSE2-NEXT: LBB31_7: ## %cond.store5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; SSE2-NEXT: movd %xmm5, %esi
; SSE2-NEXT: movl %esi, 12(%rdx)
-; SSE2-NEXT: testb $16, %dil
+; SSE2-NEXT: testb $16, %cl
; SSE2-NEXT: je LBB31_10
; SSE2-NEXT: LBB31_9: ## %cond.store7
-; SSE2-NEXT: movl %r10d, 16(%rdx)
-; SSE2-NEXT: testb $32, %dil
+; SSE2-NEXT: movl %r11d, 16(%rdx)
+; SSE2-NEXT: testb $32, %cl
; SSE2-NEXT: je LBB31_12
; SSE2-NEXT: LBB31_11: ## %cond.store9
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1]
; SSE2-NEXT: movd %xmm5, %esi
; SSE2-NEXT: movl %esi, 20(%rdx)
-; SSE2-NEXT: testb $64, %dil
+; SSE2-NEXT: testb $64, %cl
; SSE2-NEXT: je LBB31_14
; SSE2-NEXT: LBB31_13: ## %cond.store11
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE2-NEXT: movd %xmm5, %esi
; SSE2-NEXT: movl %esi, 24(%rdx)
-; SSE2-NEXT: testb %dil, %dil
+; SSE2-NEXT: testb %cl, %cl
; SSE2-NEXT: jns LBB31_16
; SSE2-NEXT: LBB31_15: ## %cond.store13
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE2-NEXT: movd %xmm4, %esi
; SSE2-NEXT: movl %esi, 28(%rdx)
-; SSE2-NEXT: testl $256, %edi ## imm = 0x100
+; SSE2-NEXT: testl $256, %ecx ## imm = 0x100
; SSE2-NEXT: je LBB31_18
; SSE2-NEXT: LBB31_17: ## %cond.store15
-; SSE2-NEXT: movl %r9d, 32(%rdx)
-; SSE2-NEXT: testl $512, %edi ## imm = 0x200
+; SSE2-NEXT: movl %r10d, 32(%rdx)
+; SSE2-NEXT: testl $512, %ecx ## imm = 0x200
; SSE2-NEXT: je LBB31_20
; SSE2-NEXT: LBB31_19: ## %cond.store17
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; SSE2-NEXT: movd %xmm4, %esi
; SSE2-NEXT: movl %esi, 36(%rdx)
-; SSE2-NEXT: testl $1024, %edi ## imm = 0x400
+; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400
; SSE2-NEXT: je LBB31_22
; SSE2-NEXT: LBB31_21: ## %cond.store19
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm4, %esi
; SSE2-NEXT: movl %esi, 40(%rdx)
-; SSE2-NEXT: testl $2048, %edi ## imm = 0x800
+; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800
; SSE2-NEXT: je LBB31_24
; SSE2-NEXT: LBB31_23: ## %cond.store21
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT: movd %xmm3, %esi
; SSE2-NEXT: movl %esi, 44(%rdx)
-; SSE2-NEXT: testl $4096, %edi ## imm = 0x1000
+; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000
; SSE2-NEXT: je LBB31_26
; SSE2-NEXT: LBB31_25: ## %cond.store23
-; SSE2-NEXT: movl %r8d, 48(%rdx)
-; SSE2-NEXT: testl $8192, %edi ## imm = 0x2000
+; SSE2-NEXT: movl %r9d, 48(%rdx)
+; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000
; SSE2-NEXT: je LBB31_28
; SSE2-NEXT: LBB31_27: ## %cond.store25
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; SSE2-NEXT: movd %xmm3, %esi
; SSE2-NEXT: movl %esi, 52(%rdx)
-; SSE2-NEXT: testl $16384, %edi ## imm = 0x4000
+; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000
; SSE2-NEXT: je LBB31_30
; SSE2-NEXT: LBB31_29: ## %cond.store27
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm3, %esi
; SSE2-NEXT: movl %esi, 56(%rdx)
-; SSE2-NEXT: testw %di, %di
+; SSE2-NEXT: testw %cx, %cx
; SSE2-NEXT: jns LBB31_32
; SSE2-NEXT: LBB31_31: ## %cond.store29
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: movd %xmm2, %esi
; SSE2-NEXT: movl %esi, 60(%rdx)
-; SSE2-NEXT: testl $65536, %edi ## imm = 0x10000
+; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000
; SSE2-NEXT: je LBB31_34
; SSE2-NEXT: LBB31_33: ## %cond.store31
-; SSE2-NEXT: movl %ecx, 64(%rdx)
-; SSE2-NEXT: testl $131072, %edi ## imm = 0x20000
+; SSE2-NEXT: movl %r8d, 64(%rdx)
+; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000
; SSE2-NEXT: je LBB31_36
; SSE2-NEXT: LBB31_35: ## %cond.store33
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: movl %ecx, 68(%rdx)
-; SSE2-NEXT: testl $262144, %edi ## imm = 0x40000
+; SSE2-NEXT: movd %xmm2, %esi
+; SSE2-NEXT: movl %esi, 68(%rdx)
+; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000
; SSE2-NEXT: je LBB31_38
; SSE2-NEXT: LBB31_37: ## %cond.store35
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: movl %ecx, 72(%rdx)
-; SSE2-NEXT: testl $524288, %edi ## imm = 0x80000
+; SSE2-NEXT: movd %xmm2, %esi
+; SSE2-NEXT: movl %esi, 72(%rdx)
+; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000
; SSE2-NEXT: je LBB31_40
; SSE2-NEXT: LBB31_39: ## %cond.store37
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: movl %ecx, 76(%rdx)
-; SSE2-NEXT: testl $1048576, %edi ## imm = 0x100000
+; SSE2-NEXT: movd %xmm1, %esi
+; SSE2-NEXT: movl %esi, 76(%rdx)
+; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000
; SSE2-NEXT: je LBB31_42
; SSE2-NEXT: LBB31_41: ## %cond.store39
; SSE2-NEXT: movl %eax, 80(%rdx)
-; SSE2-NEXT: testl $2097152, %edi ## imm = 0x200000
+; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000
; SSE2-NEXT: je LBB31_44
; SSE2-NEXT: LBB31_43: ## %cond.store41
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movl %eax, 84(%rdx)
-; SSE2-NEXT: testl $4194304, %edi ## imm = 0x400000
+; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000
; SSE2-NEXT: jne LBB31_45
; SSE2-NEXT: jmp LBB31_46
;
@@ -5878,6 +5858,9 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; SSE4-NEXT: .cfi_offset %r14, -32
; SSE4-NEXT: .cfi_offset %r15, -24
; SSE4-NEXT: .cfi_offset %rbp, -16
+; SSE4-NEXT: movdqa (%rdi), %xmm1
+; SSE4-NEXT: movdqa 32(%rdi), %xmm2
+; SSE4-NEXT: movdqa 64(%rdi), %xmm0
; SSE4-NEXT: movl 92(%rsi), %eax
; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SSE4-NEXT: movl 88(%rsi), %eax
@@ -5900,33 +5883,14 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SSE4-NEXT: movl 52(%rsi), %eax
; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm1
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
-; SSE4-NEXT: packusdw %xmm1, %xmm2
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm1
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pcmpgtd (%rdi), %xmm3
-; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
-; SSE4-NEXT: packusdw %xmm1, %xmm3
-; SSE4-NEXT: packusdw %xmm2, %xmm3
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd 80(%rdi), %xmm1
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtd 64(%rdi), %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
-; SSE4-NEXT: packusdw %xmm1, %xmm2
-; SSE4-NEXT: packusdw %xmm2, %xmm2
-; SSE4-NEXT: pmovmskb %xmm3, %eax
+; SSE4-NEXT: packssdw 48(%rdi), %xmm2
+; SSE4-NEXT: packssdw 16(%rdi), %xmm1
+; SSE4-NEXT: packsswb %xmm2, %xmm1
+; SSE4-NEXT: packssdw 80(%rdi), %xmm0
+; SSE4-NEXT: packsswb %xmm0, %xmm0
+; SSE4-NEXT: pmovmskb %xmm1, %eax
; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555
-; SSE4-NEXT: pmovmskb %xmm2, %edi
+; SSE4-NEXT: pmovmskb %xmm0, %edi
; SSE4-NEXT: andl $85, %edi
; SSE4-NEXT: shll $16, %edi
; SSE4-NEXT: orl %eax, %edi
@@ -6171,19 +6135,23 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm5
-; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vpslld $31, %ymm6, %ymm6
-; AVX2-NEXT: vpmaskmovd %ymm0, %ymm6, (%rdx)
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
+; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5
+; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %ymm3, %ymm3
+; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx)
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx)
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 3de5e4d771ed65..d650a08200a9bb 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -215,17 +215,13 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm5
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 8fcb5f627f8ef2..57588985094ad1 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -313,63 +313,59 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm8
-; SSE2-NEXT: pand %xmm11, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pand %xmm11, %xmm10
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm8
-; SSE2-NEXT: por %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm12, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: pxor %xmm7, %xmm10
+; SSE2-NEXT: por %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm9, %xmm12
; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pand %xmm12, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm12, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pand %xmm10, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm11, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm7, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pxor %xmm7, %xmm5
; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
@@ -403,42 +399,42 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: .LBB1_16: # %else14
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB1_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB1_4
; SSE2-NEXT: .LBB1_3: # %cond.store1
-; SSE2-NEXT: pextrw $1, %xmm0, %ecx
+; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB1_6
; SSE2-NEXT: .LBB1_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB1_8
; SSE2-NEXT: .LBB1_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
+; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je .LBB1_10
; SSE2-NEXT: .LBB1_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 8(%rdi)
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je .LBB1_12
; SSE2-NEXT: .LBB1_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
+; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 10(%rdi)
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB1_14
; SSE2-NEXT: .LBB1_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
+; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: movw %cx, 12(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB1_16
; SSE2-NEXT: .LBB1_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-NEXT: pextrw $7, %xmm1, %eax
; SSE2-NEXT: movw %ax, 14(%rdi)
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 795bd084730f19..5e4c4477a90b51 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -970,14 +970,13 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3
; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
@@ -1629,7 +1628,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE2OR3-LABEL: psubus_8i64_max:
; SSE2OR3: # %bb.0: # %vector.ph
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
; SSE2OR3-NEXT: pxor %xmm5, %xmm7
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
@@ -1639,53 +1638,49 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
; SSE2OR3-NEXT: pand %xmm9, %xmm8
; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2OR3-NEXT: pand %xmm8, %xmm2
+; SSE2OR3-NEXT: pand %xmm8, %xmm4
; SSE2OR3-NEXT: pxor %xmm7, %xmm8
-; SSE2OR3-NEXT: por %xmm2, %xmm8
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm8
-; SSE2OR3-NEXT: pxor %xmm5, %xmm8
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2OR3-NEXT: por %xmm4, %xmm8
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
+; SSE2OR3-NEXT: pxor %xmm5, %xmm4
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
; SSE2OR3-NEXT: movdqa %xmm6, %xmm10
; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
-; SSE2OR3-NEXT: pand %xmm10, %xmm8
-; SSE2OR3-NEXT: pand %xmm8, %xmm1
-; SSE2OR3-NEXT: pxor %xmm7, %xmm8
-; SSE2OR3-NEXT: por %xmm1, %xmm8
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm5, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2OR3-NEXT: pand %xmm10, %xmm4
+; SSE2OR3-NEXT: pand %xmm4, %xmm3
+; SSE2OR3-NEXT: pxor %xmm7, %xmm4
+; SSE2OR3-NEXT: por %xmm3, %xmm4
+; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm8[0,2]
+; SSE2OR3-NEXT: pslld $16, %xmm4
+; SSE2OR3-NEXT: psrad $16, %xmm4
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
+; SSE2OR3-NEXT: pxor %xmm5, %xmm3
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2OR3-NEXT: pand %xmm9, %xmm3
+; SSE2OR3-NEXT: pand %xmm3, %xmm2
+; SSE2OR3-NEXT: pxor %xmm7, %xmm3
+; SSE2OR3-NEXT: por %xmm2, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
+; SSE2OR3-NEXT: pxor %xmm5, %xmm2
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2OR3-NEXT: pand %xmm9, %xmm2
-; SSE2OR3-NEXT: pand %xmm2, %xmm4
-; SSE2OR3-NEXT: pxor %xmm7, %xmm2
-; SSE2OR3-NEXT: por %xmm4, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
-; SSE2OR3-NEXT: pxor %xmm5, %xmm4
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2OR3-NEXT: pand %xmm6, %xmm4
-; SSE2OR3-NEXT: pxor %xmm4, %xmm7
-; SSE2OR3-NEXT: pand %xmm3, %xmm4
-; SSE2OR3-NEXT: por %xmm7, %xmm4
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
-; SSE2OR3-NEXT: psubusw %xmm3, %xmm0
+; SSE2OR3-NEXT: pand %xmm6, %xmm2
+; SSE2OR3-NEXT: pxor %xmm2, %xmm7
+; SSE2OR3-NEXT: pand %xmm1, %xmm2
+; SSE2OR3-NEXT: por %xmm7, %xmm2
+; SSE2OR3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2OR3-NEXT: pslld $16, %xmm2
+; SSE2OR3-NEXT: psrad $16, %xmm2
+; SSE2OR3-NEXT: packssdw %xmm4, %xmm2
+; SSE2OR3-NEXT: psubusw %xmm2, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: psubus_8i64_max:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index db675103fd1df7..7f3dfffa2ad132 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -373,20 +373,16 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: andl $-16, %esp
; X86-SSE2-NEXT: subl $16, %esp
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X86-SSE2-NEXT: psllw $15, %xmm2
-; X86-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: psllw $15, %xmm0
+; X86-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpb $-1, %al
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: movl %ebp, %esp
@@ -395,20 +391,16 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
;
; X64-SSE2-LABEL: trunc_v8i64_v8i1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X64-SSE2-NEXT: psllw $15, %xmm2
-; X64-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X64-SSE2-NEXT: psllw $15, %xmm0
+; X64-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpb $-1, %al
; X64-SSE2-NEXT: sete %al
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index f668c5ddd447d6..c43c82689634eb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -371,19 +371,15 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: andl $-16, %esp
; X86-SSE2-NEXT: subl $16, %esp
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X86-SSE2-NEXT: psllw $15, %xmm2
-; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: psllw $15, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: movl %ebp, %esp
@@ -392,19 +388,15 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
;
; X64-SSE2-LABEL: trunc_v8i64_v8i1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X64-SSE2-NEXT: psllw $15, %xmm2
-; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X64-SSE2-NEXT: psllw $15, %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index c51942aa6f3539..3e68366ffe7239 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -507,20 +507,16 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: andl $-16, %esp
; X86-SSE2-NEXT: subl $16, %esp
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X86-SSE2-NEXT: psllw $15, %xmm2
-; X86-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: psllw $15, %xmm0
+; X86-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: testb %al, %al
; X86-SSE2-NEXT: setnp %al
; X86-SSE2-NEXT: movl %ebp, %esp
@@ -529,20 +525,16 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
;
; X64-SSE2-LABEL: trunc_v8i64_v8i1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X64-SSE2-NEXT: psllw $15, %xmm2
-; X64-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X64-SSE2-NEXT: psllw $15, %xmm0
+; X64-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: testb %al, %al
; X64-SSE2-NEXT: setnp %al
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 3f935b290208fd..7fa64520f93149 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -73,21 +73,17 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: paddq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
@@ -515,17 +511,13 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -854,21 +846,17 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: psubq %xmm6, %xmm2
-; SSE-NEXT: psubq %xmm7, %xmm3
-; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: psubq %xmm7, %xmm3
+; SSE-NEXT: psubq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
@@ -1266,17 +1254,13 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -1665,29 +1649,21 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: pmullw %xmm6, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE-NEXT: pslld $16, %xmm6
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm6, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
@@ -2194,17 +2170,13 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -2606,21 +2578,17 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_and_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: andps %xmm5, %xmm1
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: andps %xmm7, %xmm3
+; SSE-NEXT: andps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v8i64_v8i16:
@@ -2968,18 +2936,14 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
@@ -3305,21 +3269,17 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: xorps %xmm5, %xmm1
+; SSE-NEXT: xorps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: xorps %xmm7, %xmm3
+; SSE-NEXT: xorps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v8i64_v8i16:
@@ -3667,18 +3627,14 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
@@ -4004,21 +3960,17 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: orps %xmm5, %xmm1
+; SSE-NEXT: orps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: orps %xmm7, %xmm3
+; SSE-NEXT: orps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
@@ -4366,18 +4318,14 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 1babaf9175ea3c..50185d849b926c 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -1505,120 +1505,112 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm8
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm6
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183]
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm11, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm11, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm8
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm8, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm11, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8
+; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm11, %xmm8
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8
-; SSE2-SSSE3-NEXT: por %xmm6, %xmm8
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm6, %xmm5
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm6
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm4
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm3
+; SSE2-SSSE3-NEXT: psrad $16, %xmm3
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index a57d61b1c83c57..b65316763aad77 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1090,67 +1090,63 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSE2-SSSE3-LABEL: trunc_usat_v8i64_v8i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm4
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm1
-; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm6
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm7
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm7
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
; SSE2-SSSE3-NEXT: por %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm6
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSE2-SSSE3-NEXT: por %xmm4, %xmm7
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm5
+; SSE2-SSSE3-NEXT: psrad $16, %xmm5
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 129e535b4a841d..ee7d7ca7e38fdf 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -159,17 +159,13 @@ entry:
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-SSSE3-LABEL: trunc8i64_8i16:
; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i16:
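
For readers following the new SSE2 sequences in the updated CHECK lines above, here is a minimal C-intrinsics sketch (not part of this commit; the function name and lane handling are illustrative assumptions) of the shufps + pslld/psrad + packssdw pattern now emitted when truncating <8 x i64> to <8 x i16>:

#include <emmintrin.h>  /* SSE2; also pulls in the SSE1 shufps intrinsic */

/* Illustrative only: a0..a3 stand in for xmm0..xmm3 holding <8 x i64>. */
static __m128i trunc_v8i64_v8i16_sketch(__m128i a0, __m128i a1,
                                        __m128i a2, __m128i a3) {
  /* shufps xmm0 = xmm0[0,2],xmm1[0,2]: keep the low i32 of each i64 lane */
  __m128 lo = _mm_shuffle_ps(_mm_castsi128_ps(a0), _mm_castsi128_ps(a1),
                             _MM_SHUFFLE(2, 0, 2, 0));
  __m128 hi = _mm_shuffle_ps(_mm_castsi128_ps(a2), _mm_castsi128_ps(a3),
                             _MM_SHUFFLE(2, 0, 2, 0));
  /* pslld $16 / psrad $16: sign-extend the low 16 bits of each i32 lane */
  __m128i lo16 = _mm_srai_epi32(_mm_slli_epi32(_mm_castps_si128(lo), 16), 16);
  __m128i hi16 = _mm_srai_epi32(_mm_slli_epi32(_mm_castps_si128(hi), 16), 16);
  /* packssdw %xmm2, %xmm0: low half from lo16, high half from hi16 */
  return _mm_packs_epi32(lo16, hi16);
}

Because the pslld/psrad pair sign-extends the low 16 bits of every i32 lane first, packssdw's signed saturation can never clamp, so the PACKSS acts as a plain truncation - which is what lets the new lowering replace the old pshufd/pshuflw/punpckldq/movsd shuffle chains.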