[llvm] c3bf6d2 - [X86] Fold PSHUF(VSHIFT(X,Y)) -> VSHIFT(PSHUF(X),Y)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 22 12:02:47 PDT 2023
Author: Simon Pilgrim
Date: 2023-04-22T20:02:27+01:00
New Revision: c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a
URL: https://github.com/llvm/llvm-project/commit/c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a
DIFF: https://github.com/llvm/llvm-project/commit/c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a.diff
LOG: [X86] Fold PSHUF(VSHIFT(X,Y)) -> VSHIFT(PSHUF(X),Y)
PSHUFD/PSHUFLW/PSHUFHW can act as a vector move / folded load, which notably helps simplify pre-AVX cases.
This is a much milder alternative to refactoring canonicalizeShuffleWithBinOps to support SSE shift nodes.
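As an illustration (hand-written, based on the abds-vector-128.ll changes below): the SSE2 sign-splat sequence used when lowering i64 abs previously needed a movdqa copy because psrad overwrites its source, whereas performing the shuffle first lets the non-destructive pshufd act as the move:

  Before:
    movdqa %xmm0, %xmm1
    psrad  $31, %xmm1
    pshufd $245, %xmm1, %xmm1    # xmm1 = xmm1[1,1,3,3]

  After:
    pshufd $245, %xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]
    psrad  $31, %xmm1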
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/abds-vector-128.ll
llvm/test/CodeGen/X86/abdu-vector-128.ll
llvm/test/CodeGen/X86/avx512-cmp.ll
llvm/test/CodeGen/X86/combine-abs.ll
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/combine-ptest.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/freeze-binary.ll
llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
llvm/test/CodeGen/X86/known-signbits-vector.ll
llvm/test/CodeGen/X86/packss.ll
llvm/test/CodeGen/X86/pr32907.ll
llvm/test/CodeGen/X86/promote-cmp.ll
llvm/test/CodeGen/X86/rotate_vec.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/select-sra.ll
llvm/test/CodeGen/X86/shift-logic.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/shuffle-of-shift.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/vec_shift5.ll
llvm/test/CodeGen/X86/vector-bo-select.ll
llvm/test/CodeGen/X86/vector-sext.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
llvm/test/CodeGen/X86/viabs.ll
llvm/test/CodeGen/X86/vselect-zero.ll
llvm/test/CodeGen/X86/vselect.ll
llvm/test/CodeGen/X86/vshift-3.ll
llvm/test/CodeGen/X86/vsplit-and.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e07b597781f4..533542f46477 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42075,10 +42075,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
- case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFHW: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ if (N0->hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N0);
+ switch (V.getOpcode()) {
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI: {
+ MVT InnerVT = V.getSimpleValueType();
+ if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
+ SDValue Res = DAG.getNode(Opcode, DL, VT,
+ DAG.getBitcast(VT, V.getOperand(0)), N1);
+ Res = DAG.getBitcast(InnerVT, Res);
+ Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ }
+ }
+ }
+
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
+ }
case X86ISD::MOVSD:
case X86ISD::MOVSH:
case X86ISD::MOVSS: {
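(Roughly, the new case above rewrites a pattern like the following, shown as illustrative pseudo-DAG rather than an actual SelectionDAG dump:

    t2 = VSRAI t1, 31        ; any of VSHL/VSRL/VSRA, VSHLI/VSRLI/VSRAI, VROTLI/VROTRI
    t3 = PSHUFD t2, mask     ; t2 has no other uses

into

    t4 = PSHUFD t1, mask
    t5 = VSRAI t4, 31

with bitcasts inserted when the shift and shuffle types differ, and only when the shift's scalar elements are no wider than the shuffle's, so each shifted element is still moved as a whole.)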
diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index a48781c6ebf8..4e0bbb8115c9 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -81,44 +81,36 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: psubq %xmm10, %xmm7
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE2-NEXT: psubq %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: psubq %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: psubq %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm7
; SSE2-NEXT: psubq %xmm1, %xmm7
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm8
; SSE2-NEXT: psubq %xmm1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
@@ -233,44 +225,36 @@ define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: psubq %xmm10, %xmm7
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE2-NEXT: psubq %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: psubq %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: psubq %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm7
; SSE2-NEXT: psubq %xmm1, %xmm7
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm8
; SSE2-NEXT: psubq %xmm1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
@@ -378,14 +362,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT: psubq %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
@@ -432,14 +414,12 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT: psubq %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
@@ -1023,9 +1003,8 @@ define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_subnsw_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index 88496032aa30..5281fc78b025 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -81,24 +81,20 @@ define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: psubq %xmm6, %xmm4
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -162,24 +158,20 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: psubq %xmm6, %xmm4
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -229,14 +221,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
@@ -276,14 +266,12 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index 06da8f7631d4..919edb334b36 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -191,8 +191,8 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; KNL-NEXT: vcmpnltpd %zmm0, %zmm1, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpsrld $31, %ymm0, %ymm1
-; KNL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; KNL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
+; KNL-NEXT: vpsrld $31, %ymm1, %ymm1
; KNL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; KNL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
@@ -202,8 +202,8 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; SKX-NEXT: vcmpnltpd %zmm0, %zmm1, %k0
; SKX-NEXT: vpmovm2d %k0, %ymm0
-; SKX-NEXT: vpsrld $31, %ymm0, %ymm1
-; SKX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
+; SKX-NEXT: vpsrld $31, %ymm1, %ymm1
; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; SKX-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll
index de20b4dccb20..410218b33eb9 100644
--- a/llvm/test/CodeGen/X86/combine-abs.ll
+++ b/llvm/test/CodeGen/X86/combine-abs.ll
@@ -107,14 +107,12 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
; SSE2-LABEL: combine_v4i64_abs_abs:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 9bac05eccabd..9d7afb9478b1 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -325,9 +325,8 @@ define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
; SSE-LABEL: combine_mul_to_abs_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index 7d11745e0503..337edef96bee 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -265,9 +265,9 @@ define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
; SSE41-LABEL: ptestz_v2i64_signbits:
; SSE41: # %bb.0:
; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: ptest %xmm0, %xmm0
+; SSE41-NEXT: movmskps %xmm0, %ecx
+; SSE41-NEXT: testl %ecx, %ecx
; SSE41-NEXT: cmovnel %esi, %eax
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index c8bee6bdcca5..80455eabe9f6 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1540,9 +1540,8 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrlq $62, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: psrad $2, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1625,16 +1624,14 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: psrlq $62, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSE2-NEXT: psrad $2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: psrlq $2, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrlq $61, %xmm3
; SSE2-NEXT: psrlq $60, %xmm2
@@ -1660,9 +1657,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE41-NEXT: psrlq $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlq $60, %xmm3
; SSE41-NEXT: psrlq $61, %xmm2
@@ -1761,9 +1757,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: psrlq $62, %xmm4
; SSE2-NEXT: paddq %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE2-NEXT: psrlq $2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
@@ -1772,16 +1767,14 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: psrlq $62, %xmm4
; SSE2-NEXT: paddq %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE2-NEXT: psrlq $2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrlq $61, %xmm5
; SSE2-NEXT: psrlq $60, %xmm4
@@ -1794,9 +1787,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
; SSE2-NEXT: xorpd %xmm4, %xmm1
; SSE2-NEXT: psubq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: psrlq $61, %xmm6
; SSE2-NEXT: psrlq $60, %xmm5
@@ -1830,9 +1822,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE41-NEXT: psrlq $2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrlq $60, %xmm5
; SSE41-NEXT: psrlq $61, %xmm4
@@ -1845,9 +1836,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: psubq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: movdqa %xmm5, %xmm6
; SSE41-NEXT: psrlq $60, %xmm6
; SSE41-NEXT: psrlq $61, %xmm5
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index ea6e1fb36620..defd81e6ab77 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -541,8 +541,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrad $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; X86-NEXT: psrad $1, %xmm0
; X86-NEXT: psrad $2, %xmm0
; X86-NEXT: retl
;
@@ -650,8 +650,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: psrld $2, %xmm0
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
index f04aa2cd9ba3..6f2be411217b 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
@@ -129,14 +129,12 @@ define <4 x i1> @illegal_abs_to_eq_or(<4 x i64> %x) {
;
; SSE2-LABEL: illegal_abs_to_eq_or:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [129,129]
@@ -189,14 +187,12 @@ define <4 x i64> @illegal_abs_to_eq_or_sext(<4 x i64> %x) {
;
; SSE2-LABEL: illegal_abs_to_eq_or_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [129,129]
@@ -259,14 +255,12 @@ define <4 x i1> @illegal_abs_to_ne_and(<4 x i64> %x) {
;
; SSE2-LABEL: illegal_abs_to_ne_and:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [129,129]
@@ -328,14 +322,12 @@ define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) {
;
; SSE2-LABEL: illegal_abs_to_ne_and_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [129,129]
@@ -600,8 +592,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -628,8 +620,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
%cmp1 = icmp eq <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
%cmp2 = icmp eq <4 x i64> %x, <i64 -129, i64 -129, i64 -129, i64 -129>
@@ -744,8 +736,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
; SSE41-NEXT: pmovsxdq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; SSE41-NEXT: psllq $63, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: retq
;
; SSE2-LABEL: ne_and_to_abs_vec4x64_sext:
@@ -774,8 +766,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
%cmp1 = icmp ne <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
%cmp2 = icmp ne <4 x i64> %x, <i64 -129, i64 -129, i64 -129, i64 -129>
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
index df1cfb8337a0..02078c3575dd 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
@@ -253,9 +253,8 @@ define <2 x i1> @abs_ne_vec(<2 x i64> %0) nounwind {
;
; X64-LABEL: abs_ne_vec:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: psrad $31, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: psubq %xmm1, %xmm0
; X64-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 6b9491500b75..25f02ac6732c 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -192,8 +192,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
; X86-LABEL: signbits_ashr_shl_extract_sitofp:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vpsllq $20, %xmm0, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -203,8 +203,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
;
; X64-LABEL: signbits_ashr_shl_extract_sitofp:
; X64: # %bb.0:
-; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vpsllq $20, %xmm0, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
@@ -338,8 +338,8 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
; X86-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
@@ -350,8 +350,8 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
;
; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
; X64: # %bb.0:
-; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
@@ -371,8 +371,8 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
; X86-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: vpsrad $29, %xmm0, %xmm0
; X86-NEXT: vpxor %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -382,8 +382,8 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
;
; X64-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
; X64: # %bb.0:
-; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpsrad $29, %xmm0, %xmm0
; X64-NEXT: vpxor %xmm0, %xmm1, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
@@ -407,13 +407,13 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovapd 8(%ebp), %xmm3
; X86-NEXT: vpsrad $31, %xmm2, %xmm4
-; X86-NEXT: vpsrad $1, %xmm2, %xmm5
-; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X86-NEXT: vpsrad $1, %xmm5, %xmm5
; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; X86-NEXT: vextractf128 $1, %ymm2, %xmm2
; X86-NEXT: vpsrad $31, %xmm2, %xmm5
-; X86-NEXT: vpsrad $1, %xmm2, %xmm2
; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-NEXT: vpsrad $1, %xmm2, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
@@ -435,13 +435,13 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X64-AVX1-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
-; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT: vpsrad $1, %xmm5, %xmm5
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm5
-; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; X64-AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
@@ -460,8 +460,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
;
; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2
; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index 4475e5480a88..821d233b8237 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -9,10 +9,10 @@
define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
; SSE-LABEL: trunc_ashr_v4i64:
; SSE: # %bb.0:
-; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
@@ -42,17 +42,15 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
; SSE-LABEL: trunc_ashr_v4i64_bitcast:
; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: psrad $17, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: psrad $17, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
@@ -61,12 +59,12 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $17, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpsrad $17, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
@@ -75,8 +73,8 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
; AVX2-LABEL: trunc_ashr_v4i64_bitcast:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr32907.ll b/llvm/test/CodeGen/X86/pr32907.ll
index 5d7b282b58fb..43abf1fd9754 100644
--- a/llvm/test/CodeGen/X86/pr32907.ll
+++ b/llvm/test/CodeGen/X86/pr32907.ll
@@ -8,9 +8,8 @@ define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
; SSE2-LABEL: PR32907:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubq %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
index bc824d4fd5e3..529396ca4617 100644
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -37,8 +37,8 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3]
; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll
index ff1bcd6437d2..ae74e9e6e042 100644
--- a/llvm/test/CodeGen/X86/rotate_vec.ll
+++ b/llvm/test/CodeGen/X86/rotate_vec.ll
@@ -135,8 +135,8 @@ define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) {
define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 4abb2307a4a6..ab8a8d3bfc5b 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1134,13 +1134,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i64:
@@ -1162,13 +1162,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i64:
@@ -1244,108 +1244,106 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-LABEL: v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: paddq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pandn %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: paddq %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: paddq %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pandn %xmm0, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm5, %xmm0
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm7, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm2
; SSSE3-NEXT: paddq %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm3
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i64:
@@ -1448,204 +1446,202 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-LABEL: v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: paddq %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm9, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm10, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm10
-; SSE2-NEXT: pandn %xmm0, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pand %xmm11, %xmm0
-; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: paddq %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: pxor %xmm8, %xmm11
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm11, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm10, %xmm1
+; SSE2-NEXT: paddq %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
-; SSE2-NEXT: pxor %xmm11, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm13, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: pxor %xmm11, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm4
; SSE2-NEXT: paddq %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm5, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pxor %xmm11, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm10, %xmm2
; SSE2-NEXT: paddq %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm8
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT: pxor %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pxor %xmm11, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: paddq %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: movdqa %xmm9, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: paddq %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: pxor %xmm10, %xmm9
+; SSSE3-NEXT: movdqa %xmm0, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm9, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm9, %xmm9
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
-; SSSE3-NEXT: pxor %xmm10, %xmm11
-; SSSE3-NEXT: movdqa %xmm11, %xmm10
-; SSSE3-NEXT: pandn %xmm0, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm11, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: pand %xmm11, %xmm0
-; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: paddq %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm11
-; SSSE3-NEXT: pxor %xmm8, %xmm11
-; SSSE3-NEXT: movdqa %xmm10, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm11, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm10, %xmm1
+; SSSE3-NEXT: paddq %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
-; SSSE3-NEXT: pxor %xmm11, %xmm10
-; SSSE3-NEXT: movdqa %xmm10, %xmm5
-; SSSE3-NEXT: pandn %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm13, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pand %xmm10, %xmm1
+; SSSE3-NEXT: pxor %xmm11, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm4
; SSSE3-NEXT: paddq %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: movdqa %xmm5, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm10, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm5
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pxor %xmm11, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm10, %xmm2
; SSSE3-NEXT: paddq %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm8
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm3, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm3, %xmm10
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT: pxor %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: pxor %xmm11, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm9
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index b7387651f8c4..0d3d32572360 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -579,9 +579,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; X64-NEXT: psllq $32, %xmm3
-; X64-NEXT: movdqa %xmm3, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; X64-NEXT: psrlq $31, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -703,9 +702,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[0,1,1,3]
; X64-NEXT: psllq $32, %xmm0
-; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X64-NEXT: psrad $31, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X64-NEXT: psrlq $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/select-sra.ll b/llvm/test/CodeGen/X86/select-sra.ll
index 2a3f8a125e4a..d01d69cd6499 100644
--- a/llvm/test/CodeGen/X86/select-sra.ll
+++ b/llvm/test/CodeGen/X86/select-sra.ll
@@ -89,8 +89,8 @@ define <4 x i32> @isnonneg_v4i32(<4 x i32> %x) {
define <2 x i64> @isnonneg_v2i64(<2 x i64> %x) {
; CHECK-LABEL: isnonneg_v2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
%cond = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
@@ -186,8 +186,8 @@ define <4 x i32> @isneg_v4i32(<4 x i32> %x) {
define <2 x i64> @isneg_v2i64(<2 x i64> %x) {
; CHECK-LABEL: isneg_v2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
%cond = icmp slt <2 x i64> %x, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/shift-logic.ll b/llvm/test/CodeGen/X86/shift-logic.ll
index d098932672f3..6c96cc4e78f9 100644
--- a/llvm/test/CodeGen/X86/shift-logic.ll
+++ b/llvm/test/CodeGen/X86/shift-logic.ll
@@ -110,15 +110,13 @@ define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: ashr_or:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-NEXT: psrad $7, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-NEXT: psrlq $7, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-NEXT: psrad $12, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-NEXT: psrlq $12, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 26e2382368d1..0177acfc9248 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1921,9 +1921,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
@@ -1944,9 +1942,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
diff --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll
index 19a44cef805d..e2dc74d4e4df 100644
--- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll
+++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll
@@ -9,14 +9,14 @@
define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -26,14 +26,14 @@ define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x) nounwind {
define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -43,14 +43,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x) nounwind {
define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -61,14 +61,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x) nounwind {
define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
%i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -77,14 +77,14 @@ define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x) nounwind {
define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
%i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -93,14 +93,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x) nounwind {
define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
%i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -186,14 +186,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x) nounwind {
define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -203,14 +203,14 @@ define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x) nounwind {
define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -220,14 +220,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x) nounwind {
define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
%i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -238,14 +238,14 @@ define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x) nounwind {
define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
%i2 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -255,14 +255,14 @@ define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x) nounwind {
define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
%i2 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -272,14 +272,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x) nounwind {
define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
%i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
%i2 = bitcast <4 x i32> %i1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 111b7e763e18..3c87f3b06a77 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -1210,13 +1210,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i64:
@@ -1245,13 +1245,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i64:
@@ -1341,134 +1341,132 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-LABEL: v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: psubq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
; SSE2-NEXT: psubq %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: psubq %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: psubq %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm7
+; SSSE3-NEXT: pxor %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm7
+; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
; SSSE3-NEXT: psubq %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm3
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i64:
@@ -1592,254 +1590,252 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-LABEL: v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: psubq %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: psubq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: pandn %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm11
+; SSE2-NEXT: pxor %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm9, %xmm0
-; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: psubq %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm9, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: pand %xmm11, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm11
+; SSE2-NEXT: por %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
+; SSE2-NEXT: psubq %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pxor %xmm10, %xmm1
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm11
+; SSE2-NEXT: por %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
; SSE2-NEXT: psubq %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: pxor %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm9, %xmm2
; SSE2-NEXT: psubq %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: psubq %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: movdqa %xmm9, %xmm11
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm9, %xmm0
+; SSSE3-NEXT: psubq %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm10
+; SSSE3-NEXT: movdqa %xmm0, %xmm11
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm9
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm9, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4
+; SSSE3-NEXT: por %xmm0, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm9
-; SSSE3-NEXT: pxor %xmm10, %xmm9
-; SSSE3-NEXT: movdqa %xmm9, %xmm10
-; SSSE3-NEXT: pandn %xmm0, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm11
+; SSSE3-NEXT: pxor %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm9, %xmm0
-; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: psubq %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: movdqa %xmm9, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: pand %xmm11, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm11
+; SSSE3-NEXT: por %xmm11, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm1
+; SSSE3-NEXT: psubq %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm9, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm9
-; SSSE3-NEXT: pxor %xmm10, %xmm9
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pandn %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm11
+; SSSE3-NEXT: pxor %xmm4, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pxor %xmm10, %xmm1
+; SSSE3-NEXT: pand %xmm11, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm11
+; SSSE3-NEXT: por %xmm11, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
; SSSE3-NEXT: psubq %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: movdqa %xmm5, %xmm10
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm11, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: pxor %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm8
+; SSSE3-NEXT: por %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm9, %xmm2
; SSSE3-NEXT: psubq %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm7
-; SSSE3-NEXT: movdqa %xmm7, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm7
+; SSSE3-NEXT: movdqa %xmm7, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm3, %xmm6
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: pxor %xmm10, %xmm5
+; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index b7cfdeb7aa5a..f8bc6b01c70a 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -258,8 +258,8 @@ define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
; CHECK-NEXT: movd %xmm1, %ecx
; CHECK-NEXT: addl $3, %ecx
; CHECK-NEXT: movd %ecx, %xmm1
-; CHECK-NEXT: psrad %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: psrad %xmm1, %xmm0
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: imull %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index e1d10688a422..d9b4432b9ce9 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -5527,8 +5527,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $63, %xmm8
-; SSE2-NEXT: psrad $31, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm8
; SSE2-NEXT: movdqa %xmm8, %xmm10
; SSE2-NEXT: pandn %xmm7, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,1]
@@ -5537,8 +5537,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $63, %xmm7
-; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: movdqa %xmm7, %xmm10
; SSE2-NEXT: pandn %xmm6, %xmm10
; SSE2-NEXT: pand %xmm9, %xmm7
@@ -5546,8 +5546,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $63, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm10
; SSE2-NEXT: pandn %xmm5, %xmm10
; SSE2-NEXT: pand %xmm9, %xmm6
@@ -5555,8 +5555,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm9
; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: por %xmm9, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 1c9a31b512b4..ed7d22356de8 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -1777,12 +1777,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; SSE2-NEXT: pinsrw $6, %eax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: psllq $63, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i1_to_4i64:
@@ -1807,12 +1807,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; SSSE3-NEXT: pinsrw $6, %eax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSSE3-NEXT: psllq $63, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i1_to_4i64:
@@ -1837,12 +1837,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: pinsrb $12, %eax, %xmm1
; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE41-NEXT: psllq $63, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i1_to_4i64:
@@ -1941,12 +1941,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; X86-SSE2-NEXT: pinsrw $6, %eax, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; X86-SSE2-NEXT: psllq $63, %xmm0
-; X86-SSE2-NEXT: psrad $31, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: psrad $31, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; X86-SSE2-NEXT: psllq $63, %xmm1
-; X86-SSE2-NEXT: psrad $31, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: psrad $31, %xmm1
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: load_sext_4i1_to_4i64:
@@ -1972,12 +1972,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; X86-SSE41-NEXT: movzbl %al, %eax
; X86-SSE41-NEXT: pinsrb $12, %eax, %xmm1
; X86-SSE41-NEXT: psllq $63, %xmm0
-; X86-SSE41-NEXT: psrad $31, %xmm0
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE41-NEXT: psrad $31, %xmm0
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; X86-SSE41-NEXT: psllq $63, %xmm1
-; X86-SSE41-NEXT: psrad $31, %xmm1
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE41-NEXT: psrad $31, %xmm1
; X86-SSE41-NEXT: retl
entry:
%X = load <4 x i1>, ptr %ptr
@@ -3683,38 +3683,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $58, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: psrad $26, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $58, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: psrad $26, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $58, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $26, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; SSE2-NEXT: psllq $58, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $26, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: retq
;
@@ -3727,38 +3723,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSSE3-NEXT: psllq $58, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: psrad $26, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSSE3-NEXT: psllq $58, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSSE3-NEXT: psrad $26, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; SSSE3-NEXT: psllq $58, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSSE3-NEXT: psrad $26, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; SSSE3-NEXT: psllq $58, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSSE3-NEXT: psrad $26, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSSE3-NEXT: retq
;
@@ -3768,36 +3760,32 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT: psllq $58, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: psrad $26, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psllq $58, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: psrad $26, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT: psllq $58, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: psrad $26, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT: psllq $58, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: psrad $26, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
@@ -3851,38 +3839,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; X86-SSE2-NEXT: psllq $58, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrad $31, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: psrad $26, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; X86-SSE2-NEXT: psllq $58, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: psrad $31, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; X86-SSE2-NEXT: psrad $26, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; X86-SSE2-NEXT: psllq $58, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: psrad $31, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; X86-SSE2-NEXT: psrad $26, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; X86-SSE2-NEXT: psllq $58, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: psrad $31, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; X86-SSE2-NEXT: psrad $26, %xmm3
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; X86-SSE2-NEXT: retl
;
@@ -3892,36 +3876,32 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; X86-SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X86-SSE41-NEXT: psllq $58, %xmm0
-; X86-SSE41-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; X86-SSE41-NEXT: psllq $58, %xmm1
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; X86-SSE41-NEXT: psrad $31, %xmm1
; X86-SSE41-NEXT: psrad $26, %xmm0
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-SSE41-NEXT: psllq $58, %xmm1
-; X86-SSE41-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X86-SSE41-NEXT: psllq $58, %xmm2
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; X86-SSE41-NEXT: psrad $31, %xmm2
; X86-SSE41-NEXT: psrad $26, %xmm1
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; X86-SSE41-NEXT: psllq $58, %xmm2
-; X86-SSE41-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; X86-SSE41-NEXT: psllq $58, %xmm4
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; X86-SSE41-NEXT: psrad $31, %xmm4
; X86-SSE41-NEXT: psrad $26, %xmm2
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X86-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X86-SSE41-NEXT: psllq $58, %xmm3
-; X86-SSE41-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; X86-SSE41-NEXT: psllq $58, %xmm4
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; X86-SSE41-NEXT: psrad $31, %xmm4
; X86-SSE41-NEXT: psrad $26, %xmm3
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; X86-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; X86-SSE41-NEXT: retl
entry:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 5b6aac249919..ed1910d341a0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1385,9 +1385,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: andps %xmm0, %xmm1
@@ -1456,9 +1456,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X86-SSE-NEXT: movapd %xmm1, %xmm2
; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE-NEXT: psraw $2, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; X86-SSE-NEXT: movaps %xmm2, %xmm1
; X86-SSE-NEXT: andps %xmm0, %xmm1
@@ -1571,9 +1571,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: psrad $7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: psrlq $7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1621,9 +1620,8 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
;
; X86-SSE-LABEL: splatconstant_shift_v2i64:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X86-SSE-NEXT: psrad $7, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE-NEXT: psrlq $7, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index f50ea52c948d..ec5cf43a357a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1712,12 +1712,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $4, %xmm1
-; SSE2-NEXT: psrad $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: psrad $4, %xmm0
+; SSE2-NEXT: psrad $5, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
@@ -1762,12 +1760,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v2i32:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrad $4, %xmm1
-; X86-SSE-NEXT: psrad $5, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT: psrad $4, %xmm0
+; X86-SSE-NEXT: psrad $5, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
%shift = ashr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index be9550095cd8..8d3cb45d396d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1418,12 +1418,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: psrld $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: psrld $4, %xmm0
+; SSE2-NEXT: psrld $5, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
@@ -1468,12 +1466,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v2i32:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $4, %xmm1
-; X86-SSE-NEXT: psrld $5, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT: psrld $4, %xmm0
+; X86-SSE-NEXT: psrld $5, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
%shift = lshr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index c98954950ad2..e80a72d7e663 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1258,12 +1258,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $4, %xmm1
-; SSE2-NEXT: pslld $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: pslld $4, %xmm0
+; SSE2-NEXT: pslld $5, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
@@ -1308,12 +1306,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v2i32:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: pslld $4, %xmm1
-; X86-SSE-NEXT: pslld $5, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT: pslld $4, %xmm0
+; X86-SSE-NEXT: pslld $5, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
%shift = shl <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index 144a9e2e5439..1781196fc6f6 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -3088,11 +3088,10 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: psrad $31, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrad $31, %xmm1
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
@@ -3114,11 +3113,10 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-SSE2-NEXT: psrad $31, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrad $31, %xmm1
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
@@ -3143,53 +3141,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrad $31, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: psrad $1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrad $31, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X64-SSE2-NEXT: psrad $1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X64-AVX2-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
@@ -3199,53 +3195,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrad $31, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: psrad $2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: psrad $31, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X64-SSE2-NEXT: psrad $2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X64-AVX2-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll
index b051a7a07e6a..974af2c089a1 100644
--- a/llvm/test/CodeGen/X86/viabs.ll
+++ b/llvm/test/CodeGen/X86/viabs.ll
@@ -524,18 +524,16 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_abs_ge_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_ge_v2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubq %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -574,28 +572,24 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: psubq %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: psubq %xmm2, %xmm1
; SSSE3-NEXT: retq
@@ -643,48 +637,40 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_abs_le_v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: psubq %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: psubq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: psubq %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_le_v8i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: psubq %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm1
; SSSE3-NEXT: psubq %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: psubq %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: psubq %xmm4, %xmm3
; SSSE3-NEXT: retq
@@ -751,24 +737,20 @@ define <8 x i64> @test_abs_le_v8i64_fold(ptr %a.ptr) nounwind {
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
; SSE2-NEXT: movdqu 32(%rdi), %xmm2
; SSE2-NEXT: movdqu 48(%rdi), %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: psubq %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: psubq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: psubq %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: retq
@@ -779,24 +761,20 @@ define <8 x i64> @test_abs_le_v8i64_fold(ptr %a.ptr) nounwind {
; SSSE3-NEXT: movdqu 16(%rdi), %xmm1
; SSSE3-NEXT: movdqu 32(%rdi), %xmm2
; SSSE3-NEXT: movdqu 48(%rdi), %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: psubq %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm1
; SSSE3-NEXT: psubq %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: psubq %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: psubq %xmm4, %xmm3
; SSSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll
index fefc5a2e9b5b..cc37f2c0c548 100644
--- a/llvm/test/CodeGen/X86/vselect-zero.ll
+++ b/llvm/test/CodeGen/X86/vselect-zero.ll
@@ -308,8 +308,8 @@ define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: signbit_mask_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -343,8 +343,8 @@ define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <2 x i64> @signbit_mask_swap_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: signbit_mask_swap_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -521,11 +521,11 @@ define <8 x i32> @signbit_mask_swap_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <4 x i64> @signbit_mask_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: signbit_mask_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: retq
;
@@ -673,8 +673,8 @@ define <4 x i32> @signbit_setmask_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @signbit_setmask_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: signbit_setmask_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -815,11 +815,11 @@ define <8 x i32> @signbit_setmask_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <4 x i64> @signbit_setmask_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: signbit_setmask_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
@@ -867,11 +867,11 @@ define <4 x i64> @signbit_setmask_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @signbit_setmask_swap_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: signbit_setmask_swap_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index c444b5054b0e..3fd74a253b0f 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -484,8 +484,8 @@ define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b,
; SSE2-LABEL: shrunkblend_2uses:
; SSE2: # %bb.0:
; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: pand %xmm0, %xmm1
@@ -523,8 +523,8 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
; SSE2-LABEL: shrunkblend_nonvselectuse:
; SSE2: # %bb.0:
; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -536,8 +536,8 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
; SSE41: # %bb.0:
; SSE41-NEXT: psllq $63, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vshift-3.ll b/llvm/test/CodeGen/X86/vshift-3.ll
index 1a0aee85b4a9..8d472f00b488 100644
--- a/llvm/test/CodeGen/X86/vshift-3.ll
+++ b/llvm/test/CodeGen/X86/vshift-3.ll
@@ -11,20 +11,20 @@ define void @shift1a(<2 x i64> %val, ptr %dst) nounwind {
; X86-LABEL: shift1a:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X86-NEXT: psrad $31, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: movdqa %xmm1, (%eax)
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrad $31, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shift1a:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT: psrad $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movdqa %xmm1, (%rdi)
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrad $31, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
entry:
%ashr = ashr <2 x i64> %val, < i64 32, i64 32 >
diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll
index c2e4d350d4ca..85fc1447b9de 100644
--- a/llvm/test/CodeGen/X86/vsplit-and.ll
+++ b/llvm/test/CodeGen/X86/vsplit-and.ll
@@ -43,8 +43,8 @@ define void @t2(ptr %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
; CHECK-NEXT: andnps %xmm1, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; CHECK-NEXT: psllq $63, %xmm0
-; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pmovsxdq %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, (%rdi)
; CHECK-NEXT: movq %xmm0, 16(%rdi)