[llvm] 1584e55 - [X86] canonicalizeShuffleWithBinOps - handle general unaryshuffle(binop(x,c)) patterns not just xor(x,-1)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 4 03:01:15 PST 2021
Author: Simon Pilgrim
Date: 2021-03-04T10:44:38Z
New Revision: 1584e55a2602cd9fe0db059b06a217822ffac7cd
URL: https://github.com/llvm/llvm-project/commit/1584e55a2602cd9fe0db059b06a217822ffac7cd
DIFF: https://github.com/llvm/llvm-project/commit/1584e55a2602cd9fe0db059b06a217822ffac7cd.diff
LOG: [X86] canonicalizeShuffleWithBinOps - handle general unaryshuffle(binop(x,c)) patterns not just xor(x,-1)
Generalize the shuffle(not(x)) -> not(shuffle(x)) fold to handle any binop with an all-zeros/all-ones constant operand.
Hopefully we can further generalize this to help push target unary/binary shuffles through binops, similar to what we do in DAGCombiner::visitVECTOR_SHUFFLE.
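As an illustration, here is a reduced, hypothetical IR sketch (assumed function and value names, not copied from the modified tests) of the kind of pattern that benefits: a splatted negate, where the shuffle of sub(0,x) can now be rewritten as sub(0, shuffle(x)) because the all-zeros operand is freely shuffled. This mirrors the pshufd/psubd reordering visible in the sar_fold64.ll diffs below; whether a given case is caught here or earlier by the generic DAGCombiner depends on how the shuffle is lowered.
; Hypothetical reduced example (assumed names; a sketch, not one of the
; modified tests). The splat shuffle of the negate is pushed through the
; sub so it applies to the variable operand, keeping the zero operand as
; a freely shuffled constant.
define <4 x i32> @splat_of_neg(<4 x i32> %x) {
  %neg = sub <4 x i32> zeroinitializer, %x
  %splat = shufflevector <4 x i32> %neg, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}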
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-movmsk.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/masked_compressstore.ll
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/masked_store.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/sar_fold64.ll
llvm/test/CodeGen/X86/sdiv_fix.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-pcmp.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4cf21cbd9810..51f886ee45e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36553,12 +36553,15 @@ static SDValue combineX86ShufflesRecursively(
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
- // Handle the all undef/zero cases early.
+ // Handle the all undef/zero/ones cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
+ if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
+ none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
@@ -36887,28 +36890,53 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
return SDValue();
}
-// Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
-// help expose the 'NOT' pattern further up the DAG.
-// TODO: This might be beneficial for any binop with a 'splattable' operand.
+// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)).
static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
const SDLoc &DL) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ShuffleVT = N.getValueType();
+
+ auto IsMergeableWithShuffle = [](SDValue Op) {
+ // AllZeros/AllOnes constants are freely shuffled.
+ return ISD::isBuildVectorAllOnes(Op.getNode()) ||
+ ISD::isBuildVectorAllZeros(Op.getNode());
+ };
+ auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
+ // Ensure we only shuffle whole vector src elements, unless it's a logical
+ // binop where we can more aggressively move shuffles from dst to src.
+ return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
+ (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
+ };
+
unsigned Opc = N.getOpcode();
switch (Opc) {
+ case X86ISD::VBROADCAST:
case X86ISD::MOVDDUP:
case X86ISD::PSHUFD: {
- SDValue N0 = N.getOperand(0);
- if (N->isOnlyUserOf(N.getOperand(0).getNode())) {
- if (SDValue Not = IsNOT(N0, DAG, /*OneUse*/ true)) {
- Not = DAG.getBitcast(ShuffleVT, Not);
- Not = Opc == X86ISD::MOVDDUP
- ? DAG.getNode(Opc, DL, ShuffleVT, Not)
- : DAG.getNode(Opc, DL, ShuffleVT, Not, N.getOperand(1));
- EVT IntVT = Not.getValueType().changeTypeToInteger();
- SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
- Not = DAG.getBitcast(IntVT, Not);
- Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
- return DAG.getBitcast(ShuffleVT, Not);
+ if (N.getOperand(0).getValueType() == ShuffleVT &&
+ N->isOnlyUserOf(N.getOperand(0).getNode())) {
+ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
+ unsigned SrcOpcode = N0.getOpcode();
+ if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
+ SDValue Op00 = N0.getOperand(0);
+ SDValue Op01 = N0.getOperand(1);
+ if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
+ SDValue LHS, RHS;
+ Op00 = DAG.getBitcast(ShuffleVT, Op00);
+ Op01 = DAG.getBitcast(ShuffleVT, Op01);
+ if (N.getNumOperands() == 2) {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
+ } else {
+ LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
+ RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
+ }
+ EVT OpVT = N0.getValueType();
+ return DAG.getBitcast(ShuffleVT,
+ DAG.getNode(SrcOpcode, DL, OpVT,
+ DAG.getBitcast(OpVT, LHS),
+ DAG.getBitcast(OpVT, RHS)));
+ }
}
}
break;
diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll
index b93b747fb9d4..892475d07ade 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -65,9 +65,7 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) {
define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: pmovmskb_noneof_bitcast_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: sete %al
@@ -97,9 +95,7 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) {
define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: pmovmskb_allof_bitcast_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: cmpl $15, %eax
; SSE2-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 75251cef8a97..877dcbc6e4d2 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1998,12 +1998,13 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT: psrad $2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 70011bfe8ac0..e3533678aa23 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -603,10 +603,10 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i
define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SSE2-LABEL: compressstore_v2f32_v2i32:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm1, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB2_1
; SSE2-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 53afc0ae30e8..ccae7e18f361 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1117,10 +1117,10 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
; SSE2-LABEL: expandload_v2f32_v2i1:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm1, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB4_1
; SSE2-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 7d1e2956b28b..d6d08ac58125 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -783,10 +783,10 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, <
define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; SSE2-LABEL: load_v2f32_v2i32:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB7_1
; SSE2-NEXT: ## %bb.2: ## %else
@@ -885,10 +885,10 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2
define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) {
; SSE2-LABEL: load_v2f32_v2i32_undef:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: ## implicit-def: $xmm0
; SSE2-NEXT: jne LBB8_1
@@ -2188,10 +2188,10 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6
define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; SSE2-LABEL: load_v2i32_v2i32:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB17_1
; SSE2-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 36a27937dcbb..89955afd3fb6 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -192,10 +192,10 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; SSE2-LABEL: store_v2f32_v2i32:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB3_1
; SSE2-NEXT: ## %bb.2: ## %else
@@ -1126,10 +1126,10 @@ define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %
define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; SSE2-LABEL: store_v2i32_v2i32:
; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB10_1
; SSE2-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 9eacc459b3f2..94db1afec8de 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1190,20 +1190,20 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i64:
@@ -1221,20 +1221,20 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i64:
@@ -1324,47 +1324,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: paddq %xmm3, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
@@ -1384,47 +1384,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm8, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pandn %xmm0, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
; SSSE3-NEXT: pand %xmm9, %xmm6
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: paddq %xmm3, %xmm1
; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm9, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
@@ -1540,20 +1540,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm12, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
; SSE2-NEXT: por %xmm9, %xmm10
-; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm13
; SSE2-NEXT: pxor %xmm10, %xmm13
; SSE2-NEXT: movdqa %xmm13, %xmm12
; SSE2-NEXT: pandn %xmm0, %xmm12
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSE2-NEXT: pandn %xmm9, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT: pand %xmm10, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pand %xmm13, %xmm0
; SSE2-NEXT: por %xmm12, %xmm0
@@ -1570,20 +1570,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm14, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm12
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm12, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: pandn %xmm1, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm12, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm13
+; SSE2-NEXT: pandn %xmm1, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
+; SSE2-NEXT: pandn %xmm9, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm4
+; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm13, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm12
; SSE2-NEXT: pxor %xmm8, %xmm12
; SSE2-NEXT: paddq %xmm6, %xmm2
@@ -1597,19 +1597,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm13, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
+; SSE2-NEXT: pandn %xmm9, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
@@ -1623,19 +1623,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE2-NEXT: pandn %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT: pandn %xmm9, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm11
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: por %xmm11, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
@@ -1654,20 +1654,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm12, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
; SSSE3-NEXT: por %xmm9, %xmm10
-; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm13, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm13
; SSSE3-NEXT: pxor %xmm10, %xmm13
; SSSE3-NEXT: movdqa %xmm13, %xmm12
; SSSE3-NEXT: pandn %xmm0, %xmm12
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pandn %xmm9, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm11, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT: pand %xmm10, %xmm4
; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: pand %xmm13, %xmm0
; SSSE3-NEXT: por %xmm12, %xmm0
@@ -1684,20 +1684,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm14, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm12
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm12, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm12
-; SSSE3-NEXT: pandn %xmm1, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: por %xmm12, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm12, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm13
+; SSSE3-NEXT: pandn %xmm1, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pandn %xmm9, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm4
+; SSSE3-NEXT: pand %xmm10, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm13, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm12
; SSSE3-NEXT: pxor %xmm8, %xmm12
; SSSE3-NEXT: paddq %xmm6, %xmm2
@@ -1711,19 +1711,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm13, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pandn %xmm9, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5
+; SSSE3-NEXT: pand %xmm10, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: por %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
@@ -1737,19 +1737,19 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm6, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm9
-; SSSE3-NEXT: pandn %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pandn %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pandn %xmm9, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
+; SSSE3-NEXT: pand %xmm10, %xmm11
+; SSSE3-NEXT: por %xmm11, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: por %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll
index 8b4a8f6c940e..f597efc99b7d 100644
--- a/llvm/test/CodeGen/X86/sar_fold64.ll
+++ b/llvm/test/CodeGen/X86/sar_fold64.ll
@@ -102,26 +102,26 @@ define <4 x i32> @all_sign_bit_ashr_vec1(<4 x i32> %x) {
; SSE-LABEL: all_sign_bit_ashr_vec1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: psubd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: all_sign_bit_ashr_vec1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: all_sign_bit_ashr_vec1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: retq
%and = and <4 x i32> %x, <i32 1, i32 1, i32 1 , i32 1>
%sub = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %and
@@ -162,26 +162,26 @@ define <4 x i32> @all_sign_bit_ashr_vec3(<4 x i32> %x) {
; SSE-LABEL: all_sign_bit_ashr_vec3:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: all_sign_bit_ashr_vec3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: all_sign_bit_ashr_vec3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: retq
%and = and <4 x i32> %x, <i32 1, i32 1, i32 1 , i32 1>
%add = add <4 x i32> %and, <i32 -1, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 30794873cb49..a5471415e6e6 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -443,65 +443,65 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pcmpgtd %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X64-NEXT: movq %xmm4, %rcx
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; X64-NEXT: movdqa %xmm1, %xmm3
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movq %xmm3, %rcx
+; X64-NEXT: pxor %xmm5, %xmm5
+; X64-NEXT: pcmpgtd %xmm0, %xmm5
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; X64-NEXT: psllq $31, %xmm0
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
-; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm2, %rax
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X64-NEXT: movq %xmm3, %rcx
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X64-NEXT: movq %xmm3, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
; X64-NEXT: movq %rax, %r10
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm3, %xmm2
-; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT: movq %xmm3, %rdi
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm1, %xmm2
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-NEXT: movq %xmm4, %rdi
+; X64-NEXT: pxor %xmm5, %xmm5
+; X64-NEXT: pcmpgtd %xmm1, %xmm5
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; X64-NEXT: psllq $31, %xmm1
; X64-NEXT: movq %xmm1, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rdi
; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X64-NEXT: movq %xmm2, %rsi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm2, %rax
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X64-NEXT: movq %xmm4, %rsi
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rsi
-; X64-NEXT: movq %r11, %xmm2
+; X64-NEXT: movq %r11, %xmm4
; X64-NEXT: movq %rcx, %xmm5
; X64-NEXT: pxor %xmm6, %xmm6
-; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; X64-NEXT: pcmpeqd %xmm6, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
-; X64-NEXT: pand %xmm2, %xmm5
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm4, %xmm2
+; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; X64-NEXT: pcmpeqd %xmm6, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
+; X64-NEXT: pand %xmm4, %xmm5
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; X64-NEXT: pxor %xmm4, %xmm4
-; X64-NEXT: pcmpgtd %xmm0, %xmm4
+; X64-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: pcmpgtd %xmm0, %xmm2
; X64-NEXT: movq %r8, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; X64-NEXT: pxor %xmm2, %xmm4
-; X64-NEXT: movq %r10, %xmm2
-; X64-NEXT: pandn %xmm4, %xmm5
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: pxor %xmm4, %xmm2
+; X64-NEXT: movq %r10, %xmm4
+; X64-NEXT: pandn %xmm2, %xmm5
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT: movdqa %xmm5, %xmm2
; X64-NEXT: pandn %xmm0, %xmm2
; X64-NEXT: pcmpeqd %xmm4, %xmm4
@@ -514,13 +514,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pcmpeqd %xmm6, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
; X64-NEXT: pand %xmm2, %xmm5
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm3, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT: pcmpgtd %xmm1, %xmm6
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; X64-NEXT: pxor %xmm2, %xmm1
-; X64-NEXT: pandn %xmm1, %xmm5
+; X64-NEXT: pxor %xmm3, %xmm6
+; X64-NEXT: pandn %xmm6, %xmm5
; X64-NEXT: movq %r9, %xmm1
; X64-NEXT: movq %rax, %xmm2
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 484a8bba8fda..0f434e741212 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -1257,17 +1257,16 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i64:
@@ -1296,17 +1295,16 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i64:
@@ -1423,47 +1421,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm10, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: psubq %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: psubq %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
@@ -1496,47 +1494,47 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSSE3-NEXT: movdqa %xmm7, %xmm4
; SSSE3-NEXT: pandn %xmm0, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm10, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm10, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm7, %xmm0
; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: psubq %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT: psubq %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm10, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
@@ -1682,21 +1680,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE2-NEXT: por %xmm12, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm12, %xmm13
+; SSE2-NEXT: pxor %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm13, %xmm12
; SSE2-NEXT: pandn %xmm0, %xmm12
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3]
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm10, %xmm13
-; SSE2-NEXT: por %xmm13, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm13, %xmm0
; SSE2-NEXT: por %xmm12, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm8, %xmm12
@@ -1721,16 +1719,16 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm12, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: pandn %xmm1, %xmm12
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm13
+; SSE2-NEXT: pandn %xmm1, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm12, %xmm1
+; SSE2-NEXT: por %xmm13, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm12
; SSE2-NEXT: pxor %xmm8, %xmm12
; SSE2-NEXT: psubq %xmm6, %xmm2
@@ -1756,10 +1754,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
@@ -1789,11 +1787,11 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm11
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: por %xmm11, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: retq
@@ -1821,21 +1819,21 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSSE3-NEXT: por %xmm12, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm12, %xmm13
+; SSSE3-NEXT: pxor %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm13, %xmm12
; SSSE3-NEXT: pandn %xmm0, %xmm12
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm11, %xmm11
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm10, %xmm13
-; SSSE3-NEXT: por %xmm13, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm10, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm13, %xmm0
; SSSE3-NEXT: por %xmm12, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm12
; SSSE3-NEXT: pxor %xmm8, %xmm12
@@ -1860,16 +1858,16 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm4
; SSSE3-NEXT: pxor %xmm12, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm12
-; SSSE3-NEXT: pandn %xmm1, %xmm12
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm13
+; SSSE3-NEXT: pandn %xmm1, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm5
; SSSE3-NEXT: pand %xmm10, %xmm5
; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: por %xmm12, %xmm1
+; SSSE3-NEXT: por %xmm13, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm12
; SSSE3-NEXT: pxor %xmm8, %xmm12
; SSSE3-NEXT: psubq %xmm6, %xmm2
@@ -1895,10 +1893,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm4
; SSSE3-NEXT: pandn %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm6
; SSSE3-NEXT: pand %xmm10, %xmm6
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
@@ -1928,11 +1926,11 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pxor %xmm5, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pandn %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; SSSE3-NEXT: pandn %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm11
+; SSSE3-NEXT: pand %xmm10, %xmm11
+; SSSE3-NEXT: por %xmm11, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 05c6a799fd4d..9361af10962e 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1893,24 +1893,21 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psrlq $1, %xmm4
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: psrlq $1, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm3
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
+; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: addps %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_4f32:
@@ -2011,24 +2008,21 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psrlq $1, %xmm4
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE41-NEXT: psrlq $1, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm3
+; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm2[0],zero,zero
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: addps %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm3[0],zero
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_2f32:
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index c341db6bc91f..ede201903a47 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -778,11 +778,11 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pxor %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index ec30dba090b7..ea795d9b51f7 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -868,9 +868,9 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -1007,17 +1007,17 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -1169,9 +1169,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -1410,9 +1410,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index a6a0f015cfbd..656f678ca2bf 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -61,9 +61,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
; SSE2-LABEL: test_pcmpgtq:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_pcmpgtq:
@@ -186,11 +186,11 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
; SSE2-LABEL: test_pcmpgtq_256:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_pcmpgtq_256:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 46d2d97ca86e..ab84a589c2cd 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -653,9 +653,9 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_shift_v2i64:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -855,9 +855,9 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 61908e2241b6..7cea8eb9e8b9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -705,9 +705,9 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;