[llvm] c3bf6d2 - [X86] Fold PSHUF(VSHIFT(X,Y)) -> VSHIFT(PSHUF(X),Y)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Apr 22 12:02:47 PDT 2023


Author: Simon Pilgrim
Date: 2023-04-22T20:02:27+01:00
New Revision: c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a

URL: https://github.com/llvm/llvm-project/commit/c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a
DIFF: https://github.com/llvm/llvm-project/commit/c3bf6d20ac306b829dc99939b3a8f9487f7f1c9a.diff

LOG: [X86] Fold PSHUF(VSHIFT(X,Y)) -> VSHIFT(PSHUF(X),Y)

PSHUFD/PSHUFLW/PSHUFHW can act as a vector move / folded load, which notably helps simplify pre-AVX cases.

This is a much milder alternative to refactoring canonicalizeShuffleWithBinOps to support SSE shift nodes.
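
As a minimal illustration (drawn from the SSE2 test updates below, not part of the original commit message), commuting the shuffle ahead of the shift lets PSHUFD absorb the register copy that previously fed the shift; the $0xf5 immediate shown here encodes the [1,1,3,3] mask that appears in those tests:

    # Before:
    movdqa  %xmm0, %xmm1
    psrad   $31, %xmm1
    pshufd  $0xf5, %xmm1, %xmm1    # xmm1 = xmm1[1,1,3,3]

    # After:
    pshufd  $0xf5, %xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]
    psrad   $31, %xmm1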

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/abds-vector-128.ll
    llvm/test/CodeGen/X86/abdu-vector-128.ll
    llvm/test/CodeGen/X86/avx512-cmp.ll
    llvm/test/CodeGen/X86/combine-abs.ll
    llvm/test/CodeGen/X86/combine-mul.ll
    llvm/test/CodeGen/X86/combine-ptest.ll
    llvm/test/CodeGen/X86/combine-sdiv.ll
    llvm/test/CodeGen/X86/freeze-binary.ll
    llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
    llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
    llvm/test/CodeGen/X86/known-signbits-vector.ll
    llvm/test/CodeGen/X86/packss.ll
    llvm/test/CodeGen/X86/pr32907.ll
    llvm/test/CodeGen/X86/promote-cmp.ll
    llvm/test/CodeGen/X86/rotate_vec.ll
    llvm/test/CodeGen/X86/sadd_sat_vec.ll
    llvm/test/CodeGen/X86/sdiv_fix_sat.ll
    llvm/test/CodeGen/X86/select-sra.ll
    llvm/test/CodeGen/X86/shift-logic.ll
    llvm/test/CodeGen/X86/shrink_vmul.ll
    llvm/test/CodeGen/X86/shuffle-of-shift.ll
    llvm/test/CodeGen/X86/ssub_sat_vec.ll
    llvm/test/CodeGen/X86/vec_shift5.ll
    llvm/test/CodeGen/X86/vector-bo-select.ll
    llvm/test/CodeGen/X86/vector-sext.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
    llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
    llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
    llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
    llvm/test/CodeGen/X86/viabs.ll
    llvm/test/CodeGen/X86/vselect-zero.ll
    llvm/test/CodeGen/X86/vselect.ll
    llvm/test/CodeGen/X86/vshift-3.ll
    llvm/test/CodeGen/X86/vsplit-and.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e07b597781f4..533542f46477 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42075,10 +42075,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   }
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
-  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFHW: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    if (N0->hasOneUse()) {
+      SDValue V = peekThroughOneUseBitcasts(N0);
+      switch (V.getOpcode()) {
+      case X86ISD::VSHL:
+      case X86ISD::VSRL:
+      case X86ISD::VSRA:
+      case X86ISD::VSHLI:
+      case X86ISD::VSRLI:
+      case X86ISD::VSRAI:
+      case X86ISD::VROTLI:
+      case X86ISD::VROTRI: {
+        MVT InnerVT = V.getSimpleValueType();
+        if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
+          SDValue Res = DAG.getNode(Opcode, DL, VT,
+                                    DAG.getBitcast(VT, V.getOperand(0)), N1);
+          Res = DAG.getBitcast(InnerVT, Res);
+          Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
+          return DAG.getBitcast(VT, Res);
+        }
+        break;
+      }
+      }
+    }
+
     Mask = getPSHUFShuffleMask(N);
     assert(Mask.size() == 4);
     break;
+  }
   case X86ISD::MOVSD:
   case X86ISD::MOVSH:
   case X86ISD::MOVSS: {

diff  --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index a48781c6ebf8..4e0bbb8115c9 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -81,44 +81,36 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm10, %xmm7
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
 ; SSE2-NEXT:    psubq %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm5
 ; SSE2-NEXT:    psubq %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm6
 ; SSE2-NEXT:    psubq %xmm1, %xmm6
-; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm7
 ; SSE2-NEXT:    psubq %xmm1, %xmm7
-; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm8
 ; SSE2-NEXT:    psubq %xmm1, %xmm8
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
@@ -233,44 +225,36 @@ define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm10, %xmm7
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
 ; SSE2-NEXT:    psubq %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm5
 ; SSE2-NEXT:    psubq %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm6
 ; SSE2-NEXT:    psubq %xmm1, %xmm6
-; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm7
 ; SSE2-NEXT:    psubq %xmm1, %xmm7
-; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm8
 ; SSE2-NEXT:    psubq %xmm1, %xmm8
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
@@ -378,14 +362,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
@@ -432,14 +414,12 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
@@ -1023,9 +1003,8 @@ define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2-LABEL: abd_subnsw_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index 88496032aa30..5281fc78b025 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -81,24 +81,20 @@ define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm6, %xmm4
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -162,24 +158,20 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm6, %xmm4
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -229,14 +221,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
@@ -276,14 +266,12 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]

diff  --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index 06da8f7631d4..919edb334b36 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -191,8 +191,8 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
 ; KNL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vcmpnltpd %zmm0, %zmm1, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vpsrld $31, %ymm0, %ymm1
-; KNL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; KNL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
+; KNL-NEXT:    vpsrld $31, %ymm1, %ymm1
 ; KNL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; KNL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
@@ -202,8 +202,8 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
 ; SKX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vcmpnltpd %zmm0, %zmm1, %k0
 ; SKX-NEXT:    vpmovm2d %k0, %ymm0
-; SKX-NEXT:    vpsrld $31, %ymm0, %ymm1
-; SKX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
+; SKX-NEXT:    vpsrld $31, %ymm1, %ymm1
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
 ; SKX-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll
index de20b4dccb20..410218b33eb9 100644
--- a/llvm/test/CodeGen/X86/combine-abs.ll
+++ b/llvm/test/CodeGen/X86/combine-abs.ll
@@ -107,14 +107,12 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
 define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
 ; SSE2-LABEL: combine_v4i64_abs_abs:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 9bac05eccabd..9d7afb9478b1 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -325,9 +325,8 @@ define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
 define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
 ; SSE-LABEL: combine_mul_to_abs_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    psrlq $32, %xmm2

diff  --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index 7d11745e0503..337edef96bee 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -265,9 +265,9 @@ define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
 ; SSE41-LABEL: ptestz_v2i64_signbits:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movl %edi, %eax
-; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    movmskps %xmm0, %ecx
+; SSE41-NEXT:    testl %ecx, %ecx
 ; SSE41-NEXT:    cmovnel %esi, %eax
 ; SSE41-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index c8bee6bdcca5..80455eabe9f6 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1540,9 +1540,8 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    psrlq $62, %xmm1
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    psrad $2, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    psrlq $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1625,16 +1624,14 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    psrlq $62, %xmm2
 ; SSE2-NEXT:    paddq %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    psrad $2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    psrlq $2, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    psrlq $61, %xmm3
 ; SSE2-NEXT:    psrlq $60, %xmm2
@@ -1660,9 +1657,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; SSE41-NEXT:    psrlq $2, %xmm2
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psrlq $60, %xmm3
 ; SSE41-NEXT:    psrlq $61, %xmm2
@@ -1761,9 +1757,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    psrlq $62, %xmm4
 ; SSE2-NEXT:    paddq %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
 ; SSE2-NEXT:    psrad $2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
 ; SSE2-NEXT:    psrlq $2, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
@@ -1772,16 +1767,14 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    psrlq $62, %xmm4
 ; SSE2-NEXT:    paddq %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
 ; SSE2-NEXT:    psrad $2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
 ; SSE2-NEXT:    psrlq $2, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    psrlq $61, %xmm5
 ; SSE2-NEXT:    psrlq $60, %xmm4
@@ -1794,9 +1787,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
 ; SSE2-NEXT:    xorpd %xmm4, %xmm1
 ; SSE2-NEXT:    psubq %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
 ; SSE2-NEXT:    psrlq $61, %xmm6
 ; SSE2-NEXT:    psrlq $60, %xmm5
@@ -1830,9 +1822,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE41-NEXT:    psrlq $2, %xmm4
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psrlq $60, %xmm5
 ; SSE41-NEXT:    psrlq $61, %xmm4
@@ -1845,9 +1836,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
 ; SSE41-NEXT:    pxor %xmm4, %xmm1
 ; SSE41-NEXT:    psubq %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE41-NEXT:    movdqa %xmm5, %xmm6
 ; SSE41-NEXT:    psrlq $60, %xmm6
 ; SSE41-NEXT:    psrlq $61, %xmm5

diff  --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index ea6e1fb36620..defd81e6ab77 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -541,8 +541,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    psrad $1, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; X86-NEXT:    psrad $1, %xmm0
 ; X86-NEXT:    psrad $2, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -650,8 +650,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    psrld $1, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; X86-NEXT:    psrld $1, %xmm0
 ; X86-NEXT:    psrld $2, %xmm0
 ; X86-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
index f04aa2cd9ba3..6f2be411217b 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
@@ -129,14 +129,12 @@ define <4 x i1> @illegal_abs_to_eq_or(<4 x i64> %x) {
 ;
 ; SSE2-LABEL: illegal_abs_to_eq_or:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [129,129]
@@ -189,14 +187,12 @@ define <4 x i64> @illegal_abs_to_eq_or_sext(<4 x i64> %x) {
 ;
 ; SSE2-LABEL: illegal_abs_to_eq_or_sext:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [129,129]
@@ -259,14 +255,12 @@ define <4 x i1> @illegal_abs_to_ne_and(<4 x i64> %x) {
 ;
 ; SSE2-LABEL: illegal_abs_to_ne_and:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [129,129]
@@ -328,14 +322,12 @@ define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) {
 ;
 ; SSE2-LABEL: illegal_abs_to_ne_and_sext:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [129,129]
@@ -600,8 +592,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; SSE41-NEXT:    psllq $63, %xmm0
-; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -628,8 +620,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    psllq $63, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    retq
   %cmp1 = icmp eq <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
   %cmp2 = icmp eq <4 x i64> %x, <i64 -129, i64 -129, i64 -129, i64 -129>
@@ -744,8 +736,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT:    pmovsxdq %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
 ; SSE41-NEXT:    psllq $63, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: ne_and_to_abs_vec4x64_sext:
@@ -774,8 +766,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    psllq $63, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    retq
   %cmp1 = icmp ne <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
   %cmp2 = icmp ne <4 x i64> %x, <i64 -129, i64 -129, i64 -129, i64 -129>

diff  --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
index df1cfb8337a0..02078c3575dd 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
@@ -253,9 +253,8 @@ define <2 x i1> @abs_ne_vec(<2 x i64> %0) nounwind {
 ;
 ; X64-LABEL: abs_ne_vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    psrad $31, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    psubq %xmm1, %xmm0
 ; X64-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0

diff  --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 6b9491500b75..25f02ac6732c 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -192,8 +192,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ; X86-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vpsllq $20, %xmm0, %xmm0
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -203,8 +203,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ;
 ; X64-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vpsllq $20, %xmm0, %xmm0
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
@@ -338,8 +338,8 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X86-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
@@ -350,8 +350,8 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ;
 ; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vmovd %edi, %xmm1
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
@@ -371,8 +371,8 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ; X86-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X86-NEXT:    vpxor %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -382,8 +382,8 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ;
 ; X64-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpsrad $29, %xmm0, %xmm0
 ; X64-NEXT:    vpxor %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
@@ -407,13 +407,13 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    vmovapd 8(%ebp), %xmm3
 ; X86-NEXT:    vpsrad $31, %xmm2, %xmm4
-; X86-NEXT:    vpsrad $1, %xmm2, %xmm5
-; X86-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X86-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X86-NEXT:    vpsrad $1, %xmm5, %xmm5
 ; X86-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
 ; X86-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X86-NEXT:    vpsrad $31, %xmm2, %xmm5
-; X86-NEXT:    vpsrad $1, %xmm2, %xmm2
 ; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-NEXT:    vpsrad $1, %xmm2, %xmm2
 ; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; X86-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
@@ -435,13 +435,13 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X64-AVX1-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vpsrad $31, %xmm2, %xmm4
-; X64-AVX1-NEXT:    vpsrad $1, %xmm2, %xmm5
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT:    vpsrad $1, %xmm5, %xmm5
 ; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X64-AVX1-NEXT:    vpsrad $31, %xmm2, %xmm5
-; X64-AVX1-NEXT:    vpsrad $1, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT:    vpsrad $1, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; X64-AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
@@ -460,8 +460,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ;
 ; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpsrad $1, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; X64-AVX2-NEXT:    vpsrad $1, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; X64-AVX2-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm3, %ymm0

diff  --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index 4475e5480a88..821d233b8237 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -9,10 +9,10 @@
 define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
 ; SSE-LABEL: trunc_ashr_v4i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psrad $31, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    psrad $31, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    psrad $31, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    ret{{[l|q]}}
 ;
@@ -42,17 +42,15 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
 define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
 ; SSE-LABEL: trunc_ashr_v4i64_bitcast:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE-NEXT:    psrad $17, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE-NEXT:    psrad $17, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    ret{{[l|q]}}
@@ -61,12 +59,12 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrad $17, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpsrad $17, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
@@ -75,8 +73,8 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
 ; AVX2-LABEL: trunc_ashr_v4i64_bitcast:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/pr32907.ll b/llvm/test/CodeGen/X86/pr32907.ll
index 5d7b282b58fb..43abf1fd9754 100644
--- a/llvm/test/CodeGen/X86/pr32907.ll
+++ b/llvm/test/CodeGen/X86/pr32907.ll
@@ -8,9 +8,8 @@ define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
 ; SSE2-LABEL: PR32907:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    psubq %xmm0, %xmm1
 ; SSE2-NEXT:    pand %xmm2, %xmm1

diff  --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
index bc824d4fd5e3..529396ca4617 100644
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -37,8 +37,8 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3]
 ; SSE2-NEXT:    psllq $63, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm3, %xmm2
 ; SSE2-NEXT:    por %xmm2, %xmm1

diff  --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll
index ff1bcd6437d2..ae74e9e6e042 100644
--- a/llvm/test/CodeGen/X86/rotate_vec.ll
+++ b/llvm/test/CodeGen/X86/rotate_vec.ll
@@ -135,8 +135,8 @@ define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) {
 define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
 ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpsrad $25, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    vpsrad $25, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 4abb2307a4a6..ab8a8d3bfc5b 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1134,13 +1134,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i64:
@@ -1162,13 +1162,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i64:
@@ -1244,108 +1244,106 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-LABEL: v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    paddq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    paddq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm6
-; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm5, %xmm0
-; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm2
 ; SSE2-NEXT:    paddq %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm5, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    paddq %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    paddq %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm0, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT:    pxor %xmm6, %xmm7
-; SSSE3-NEXT:    movdqa %xmm7, %xmm6
-; SSSE3-NEXT:    pandn %xmm0, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm5, %xmm0
-; SSSE3-NEXT:    pand %xmm7, %xmm0
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pandn %xmm4, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
 ; SSSE3-NEXT:    paddq %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSSE3-NEXT:    pxor %xmm1, %xmm6
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pandn %xmm1, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm5, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm5
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i64:
@@ -1448,204 +1446,202 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-LABEL: v8i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    paddq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NEXT:    movdqa %xmm11, %xmm10
-; SSE2-NEXT:    pandn %xmm0, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    pand %xmm11, %xmm0
-; SSE2-NEXT:    por %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    paddq %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    pxor %xmm8, %xmm11
-; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm11, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    paddq %xmm5, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm10, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm10
-; SSE2-NEXT:    pxor %xmm11, %xmm10
-; SSE2-NEXT:    movdqa %xmm10, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
+; SSE2-NEXT:    pxor %xmm11, %xmm1
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm5
 ; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
 ; SSE2-NEXT:    paddq %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm5, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm5
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    pxor %xmm11, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
 ; SSE2-NEXT:    paddq %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm9, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm10
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT:    pxor %xmm5, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    pxor %xmm11, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm9
+; SSE2-NEXT:    por %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
-; SSSE3-NEXT:    paddq %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm10
-; SSSE3-NEXT:    movdqa %xmm9, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    paddq %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm9
+; SSSE3-NEXT:    pxor %xmm10, %xmm9
+; SSSE3-NEXT:    movdqa %xmm0, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm11
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm9
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm11
-; SSSE3-NEXT:    pxor %xmm10, %xmm11
-; SSSE3-NEXT:    movdqa %xmm11, %xmm10
-; SSSE3-NEXT:    pandn %xmm0, %xmm10
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm9, %xmm0
-; SSSE3-NEXT:    pand %xmm11, %xmm0
-; SSSE3-NEXT:    por %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm10
-; SSSE3-NEXT:    paddq %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm11
-; SSSE3-NEXT:    pxor %xmm8, %xmm11
-; SSSE3-NEXT:    movdqa %xmm10, %xmm12
-; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm11, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    paddq %xmm5, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm12
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm10, %xmm10
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm10
-; SSSE3-NEXT:    pxor %xmm11, %xmm10
-; SSSE3-NEXT:    movdqa %xmm10, %xmm5
-; SSSE3-NEXT:    pandn %xmm1, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    pand %xmm10, %xmm1
+; SSSE3-NEXT:    pxor %xmm11, %xmm1
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
 ; SSSE3-NEXT:    por %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
 ; SSSE3-NEXT:    paddq %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm10
-; SSSE3-NEXT:    movdqa %xmm5, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm5
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    por %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    pxor %xmm11, %xmm4
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
 ; SSSE3-NEXT:    paddq %xmm7, %xmm3
-; SSSE3-NEXT:    pxor %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm6, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm3, %xmm5
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm9, %xmm3
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm10
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT:    pxor %xmm5, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm5
+; SSSE3-NEXT:    pxor %xmm11, %xmm5
+; SSSE3-NEXT:    pand %xmm9, %xmm5
+; SSSE3-NEXT:    pandn %xmm3, %xmm9
+; SSSE3-NEXT:    por %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i64:

diff  --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index b7387651f8c4..0d3d32572360 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -579,9 +579,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
 ; X64-NEXT:    psllq $32, %xmm3
-; X64-NEXT:    movdqa %xmm3, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; X64-NEXT:    psrlq $31, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -703,9 +702,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[0,1,1,3]
 ; X64-NEXT:    psllq $32, %xmm0
-; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X64-NEXT:    psrlq $31, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]

diff  --git a/llvm/test/CodeGen/X86/select-sra.ll b/llvm/test/CodeGen/X86/select-sra.ll
index 2a3f8a125e4a..d01d69cd6499 100644
--- a/llvm/test/CodeGen/X86/select-sra.ll
+++ b/llvm/test/CodeGen/X86/select-sra.ll
@@ -89,8 +89,8 @@ define <4 x i32> @isnonneg_v4i32(<4 x i32> %x) {
 define <2 x i64> @isnonneg_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: isnonneg_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %cond = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
@@ -186,8 +186,8 @@ define <4 x i32> @isneg_v4i32(<4 x i32> %x) {
 define <2 x i64> @isneg_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: isneg_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %cond = icmp slt <2 x i64> %x, zeroinitializer

diff  --git a/llvm/test/CodeGen/X86/shift-logic.ll b/llvm/test/CodeGen/X86/shift-logic.ll
index d098932672f3..6c96cc4e78f9 100644
--- a/llvm/test/CodeGen/X86/shift-logic.ll
+++ b/llvm/test/CodeGen/X86/shift-logic.ll
@@ -110,15 +110,13 @@ define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: ashr_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-NEXT:    psrad $7, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-NEXT:    psrlq $7, %xmm1
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
 ; CHECK-NEXT:    psrad $12, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-NEXT:    psrlq $12, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]

diff  --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 26e2382368d1..0177acfc9248 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1921,9 +1921,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    psllq $32, %xmm0
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
@@ -1944,9 +1942,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    psllq $32, %xmm0
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)

diff  --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll
index 19a44cef805d..e2dc74d4e4df 100644
--- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll
+++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll
@@ -9,14 +9,14 @@
 define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_shl_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_shl_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -26,14 +26,14 @@ define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x) nounwind {
 define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_lshr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_lshr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -43,14 +43,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x) nounwind {
 define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_ashr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psraw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    psraw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_ashr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <4 x i32>
@@ -61,14 +61,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x) nounwind {
 define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_shl_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_shl_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
   %i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -77,14 +77,14 @@ define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x) nounwind {
 define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_lshr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrld $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    psrld $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_lshr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
   %i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -93,14 +93,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x) nounwind {
 define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_ashr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_ashr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
   %i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -186,14 +186,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_shl_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_shl_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -203,14 +203,14 @@ define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_lshr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_lshr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -220,14 +220,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_ashr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psraw $15, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psraw $15, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_ashr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
   %i2 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -238,14 +238,14 @@ define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_shl_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_shl_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
   %i2 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -255,14 +255,14 @@ define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_lshr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrld $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psrld $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_lshr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
   %i2 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -272,14 +272,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x) nounwind {
 define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_ashr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_ashr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
   %i2 = bitcast <4 x i32> %i1 to <2 x i64>

diff  --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 111b7e763e18..3c87f3b06a77 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -1210,13 +1210,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i64:
@@ -1245,13 +1245,13 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm1, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i64:
@@ -1341,134 +1341,132 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-LABEL: v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    psubq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm7
 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm7
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
 ; SSE2-NEXT:    psubq %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    psubq %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm5, %xmm0
+; SSSE3-NEXT:    psubq %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    movdqa %xmm0, %xmm7
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pandn %xmm4, %xmm7
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
 ; SSSE3-NEXT:    psubq %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm6, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm3
-; SSSE3-NEXT:    pandn %xmm1, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm5
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i64:
@@ -1592,254 +1590,252 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-LABEL: v8i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    psubq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    psubq %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm9, %xmm10
+; SSE2-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm9
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm10
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm11, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm9
-; SSE2-NEXT:    pxor %xmm10, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, %xmm10
-; SSE2-NEXT:    pandn %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm11
+; SSE2-NEXT:    pxor %xmm10, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pand %xmm9, %xmm0
-; SSE2-NEXT:    por %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    psubq %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    pand %xmm11, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm11
+; SSE2-NEXT:    por %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    psubq %xmm5, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm11, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm9
-; SSE2-NEXT:    pxor %xmm10, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm11
+; SSE2-NEXT:    pxor %xmm4, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm9, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    pand %xmm11, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm11
+; SSE2-NEXT:    por %xmm11, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
 ; SSE2-NEXT:    psubq %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm10, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pand %xmm5, %xmm2
-; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm8
+; SSE2-NEXT:    pxor %xmm5, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm8
+; SSE2-NEXT:    por %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm9, %xmm2
 ; SSE2-NEXT:    psubq %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm7
+; SSE2-NEXT:    movdqa %xmm7, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm9, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm5
-; SSE2-NEXT:    pxor %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm3, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
-; SSSE3-NEXT:    psubq %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm10
-; SSSE3-NEXT:    movdqa %xmm9, %xmm11
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm9, %xmm0
+; SSSE3-NEXT:    psubq %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm10
+; SSSE3-NEXT:    pxor %xmm9, %xmm10
+; SSSE3-NEXT:    movdqa %xmm0, %xmm11
 ; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm9
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm9, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm0, %xmm10
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm11, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm9
-; SSSE3-NEXT:    pxor %xmm10, %xmm9
-; SSSE3-NEXT:    movdqa %xmm9, %xmm10
-; SSSE3-NEXT:    pandn %xmm0, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm11
+; SSSE3-NEXT:    pxor %xmm10, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pand %xmm9, %xmm0
-; SSSE3-NEXT:    por %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
-; SSSE3-NEXT:    psubq %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm10
-; SSSE3-NEXT:    movdqa %xmm9, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    pand %xmm11, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm11
+; SSSE3-NEXT:    por %xmm11, %xmm0
+; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    psubq %xmm5, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm11
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm9, %xmm10
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm11, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm9
-; SSSE3-NEXT:    pxor %xmm10, %xmm9
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pandn %xmm1, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm11
+; SSSE3-NEXT:    pxor %xmm4, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pand %xmm9, %xmm1
-; SSSE3-NEXT:    por %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    pand %xmm11, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm11
+; SSSE3-NEXT:    por %xmm11, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
 ; SSSE3-NEXT:    psubq %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
-; SSSE3-NEXT:    movdqa %xmm5, %xmm10
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm11, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm10, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
 ; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm2, %xmm6
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pand %xmm5, %xmm2
-; SSSE3-NEXT:    por %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm8
+; SSSE3-NEXT:    pxor %xmm5, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    pand %xmm8, %xmm4
+; SSSE3-NEXT:    pandn %xmm2, %xmm8
+; SSSE3-NEXT:    por %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm9, %xmm2
 ; SSSE3-NEXT:    psubq %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm6
-; SSSE3-NEXT:    pxor %xmm8, %xmm6
-; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm8, %xmm7
-; SSSE3-NEXT:    movdqa %xmm7, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm7
+; SSSE3-NEXT:    movdqa %xmm7, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm9, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm5
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm3, %xmm6
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pand %xmm5, %xmm3
-; SSSE3-NEXT:    por %xmm6, %xmm3
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    pand %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i64:

diff  --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index b7cfdeb7aa5a..f8bc6b01c70a 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -258,8 +258,8 @@ define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
 ; CHECK-NEXT:    movd %xmm1, %ecx
 ; CHECK-NEXT:    addl $3, %ecx
 ; CHECK-NEXT:    movd %ecx, %xmm1
-; CHECK-NEXT:    psrad %xmm1, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT:    psrad %xmm1, %xmm0
 ; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    imull %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}

diff  --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index e1d10688a422..d9b4432b9ce9 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -5527,8 +5527,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $63, %xmm8
-; SSE2-NEXT:    psrad $31, %xmm8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm8
 ; SSE2-NEXT:    movdqa %xmm8, %xmm10
 ; SSE2-NEXT:    pandn %xmm7, %xmm10
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [1,1]
@@ -5537,8 +5537,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $63, %xmm7
-; SSE2-NEXT:    psrad $31, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm7
 ; SSE2-NEXT:    movdqa %xmm7, %xmm10
 ; SSE2-NEXT:    pandn %xmm6, %xmm10
 ; SSE2-NEXT:    pand %xmm9, %xmm7
@@ -5546,8 +5546,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $63, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    movdqa %xmm6, %xmm10
 ; SSE2-NEXT:    pandn %xmm5, %xmm10
 ; SSE2-NEXT:    pand %xmm9, %xmm6
@@ -5555,8 +5555,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x,
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm5
 ; SSE2-NEXT:    pand %xmm5, %xmm9
 ; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm5
 ; SSE2-NEXT:    por %xmm9, %xmm5

diff  --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 1c9a31b512b4..ed7d22356de8 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -1777,12 +1777,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; SSE2-NEXT:    pinsrw $6, %eax, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
 ; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
 ; SSE2-NEXT:    psllq $63, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_sext_4i1_to_4i64:
@@ -1807,12 +1807,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; SSSE3-NEXT:    pinsrw $6, %eax, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
 ; SSSE3-NEXT:    psllq $63, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
 ; SSSE3-NEXT:    psllq $63, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_sext_4i1_to_4i64:
@@ -1837,12 +1837,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; SSE41-NEXT:    movzbl %al, %eax
 ; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
 ; SSE41-NEXT:    psllq $63, %xmm0
-; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; SSE41-NEXT:    psllq $63, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: load_sext_4i1_to_4i64:
@@ -1941,12 +1941,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; X86-SSE2-NEXT:    pinsrw $6, %eax, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
 ; X86-SSE2-NEXT:    psllq $63, %xmm0
-; X86-SSE2-NEXT:    psrad $31, %xmm0
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    psrad $31, %xmm0
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
 ; X86-SSE2-NEXT:    psllq $63, %xmm1
-; X86-SSE2-NEXT:    psrad $31, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    psrad $31, %xmm1
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: load_sext_4i1_to_4i64:
@@ -1972,12 +1972,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; X86-SSE41-NEXT:    movzbl %al, %eax
 ; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
 ; X86-SSE41-NEXT:    psllq $63, %xmm0
-; X86-SSE41-NEXT:    psrad $31, %xmm0
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE41-NEXT:    psrad $31, %xmm0
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; X86-SSE41-NEXT:    psllq $63, %xmm1
-; X86-SSE41-NEXT:    psrad $31, %xmm1
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE41-NEXT:    psrad $31, %xmm1
 ; X86-SSE41-NEXT:    retl
 entry:
  %X = load <4 x i1>, ptr %ptr
@@ -3683,38 +3683,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $58, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    psrad $26, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $58, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    psrad $26, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $58, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; SSE2-NEXT:    psrad $26, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; SSE2-NEXT:    psllq $58, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; SSE2-NEXT:    psrad $26, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSE2-NEXT:    retq
 ;
@@ -3727,38 +3723,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; SSSE3-NEXT:    psllq $58, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSSE3-NEXT:    psrad $26, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSSE3-NEXT:    psllq $58, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSSE3-NEXT:    psrad $26, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; SSSE3-NEXT:    psllq $58, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; SSSE3-NEXT:    psrad $26, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; SSSE3-NEXT:    psllq $58, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; SSSE3-NEXT:    psrad $26, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -3768,36 +3760,32 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
 ; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT:    psllq $58, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT:    psllq $58, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    psrad $26, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT:    psllq $58, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psllq $58, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm2
 ; SSE41-NEXT:    psrad $26, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT:    psllq $58, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT:    psllq $58, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm4
 ; SSE41-NEXT:    psrad $26, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT:    psllq $58, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT:    psllq $58, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    psrad $31, %xmm4
 ; SSE41-NEXT:    psrad $26, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
 ; SSE41-NEXT:    retq
 ;
@@ -3851,38 +3839,34 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
 ; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; X86-SSE2-NEXT:    psllq $58, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    psrad $31, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $26, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; X86-SSE2-NEXT:    psllq $58, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    psrad $31, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $26, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
 ; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; X86-SSE2-NEXT:    psllq $58, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; X86-SSE2-NEXT:    psrad $31, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $26, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; X86-SSE2-NEXT:    psllq $58, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    psrad $31, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $26, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; X86-SSE2-NEXT:    retl
 ;
@@ -3892,36 +3876,32 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; X86-SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
 ; X86-SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X86-SSE41-NEXT:    psllq $58, %xmm0
-; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; X86-SSE41-NEXT:    psllq $58, %xmm1
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; X86-SSE41-NEXT:    psrad $31, %xmm1
 ; X86-SSE41-NEXT:    psrad $26, %xmm0
-; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-SSE41-NEXT:    psllq $58, %xmm1
-; X86-SSE41-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X86-SSE41-NEXT:    psllq $58, %xmm2
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; X86-SSE41-NEXT:    psrad $31, %xmm2
 ; X86-SSE41-NEXT:    psrad $26, %xmm1
-; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; X86-SSE41-NEXT:    psllq $58, %xmm2
-; X86-SSE41-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; X86-SSE41-NEXT:    psllq $58, %xmm4
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; X86-SSE41-NEXT:    psrad $31, %xmm4
 ; X86-SSE41-NEXT:    psrad $26, %xmm2
-; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X86-SSE41-NEXT:    psllq $58, %xmm3
-; X86-SSE41-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; X86-SSE41-NEXT:    psllq $58, %xmm4
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
 ; X86-SSE41-NEXT:    psrad $31, %xmm4
 ; X86-SSE41-NEXT:    psrad $26, %xmm3
-; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
 ; X86-SSE41-NEXT:    retl
 entry:

diff  --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 5b6aac249919..ed1910d341a0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1385,9 +1385,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    psraw $2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
 ; SSE2-NEXT:    movaps %xmm2, %xmm1
 ; SSE2-NEXT:    andps %xmm0, %xmm1
@@ -1456,9 +1456,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; X86-SSE-NEXT:    movapd %xmm1, %xmm2
 ; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE-NEXT:    psraw $2, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
 ; X86-SSE-NEXT:    movaps %xmm2, %xmm1
 ; X86-SSE-NEXT:    andps %xmm0, %xmm1
@@ -1571,9 +1571,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
 ; SSE2-LABEL: splatconstant_shift_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    psrad $7, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    psrlq $7, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1621,9 +1620,8 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v2i64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; X86-SSE-NEXT:    psrad $7, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE-NEXT:    psrlq $7, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]

diff  --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index f50ea52c948d..ec5cf43a357a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1712,12 +1712,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $4, %xmm1
-; SSE2-NEXT:    psrad $5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    psrad $4, %xmm0
+; SSE2-NEXT:    psrad $5, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v2i32:
@@ -1762,12 +1760,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v2i32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrad $4, %xmm1
-; X86-SSE-NEXT:    psrad $5, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    psrad $4, %xmm0
+; X86-SSE-NEXT:    psrad $5, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE-NEXT:    retl
   %shift = ashr <2 x i32> %a, <i32 4, i32 5>
   ret <2 x i32> %shift

diff  --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index be9550095cd8..8d3cb45d396d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1418,12 +1418,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $4, %xmm1
-; SSE2-NEXT:    psrld $5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    psrld $4, %xmm0
+; SSE2-NEXT:    psrld $5, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v2i32:
@@ -1468,12 +1466,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v2i32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrld $4, %xmm1
-; X86-SSE-NEXT:    psrld $5, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    psrld $4, %xmm0
+; X86-SSE-NEXT:    psrld $5, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE-NEXT:    retl
   %shift = lshr <2 x i32> %a, <i32 4, i32 5>
   ret <2 x i32> %shift

diff  --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index c98954950ad2..e80a72d7e663 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1258,12 +1258,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pslld $4, %xmm1
-; SSE2-NEXT:    pslld $5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pslld $4, %xmm0
+; SSE2-NEXT:    pslld $5, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v2i32:
@@ -1308,12 +1306,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v2i32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    pslld $4, %xmm1
-; X86-SSE-NEXT:    pslld $5, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pslld $4, %xmm0
+; X86-SSE-NEXT:    pslld $5, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE-NEXT:    retl
   %shift = shl <2 x i32> %a, <i32 4, i32 5>
   ret <2 x i32> %shift

diff  --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index 144a9e2e5439..1781196fc6f6 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -3088,11 +3088,10 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
 ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT:    psrad $31, %xmm0
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrad $31, %xmm1
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
@@ -3114,11 +3113,10 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
 ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-SSE2-NEXT:    psrad $31, %xmm0
 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrad $31, %xmm1
+; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
@@ -3143,53 +3141,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
 define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    psrad $31, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX2-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    psrad $31, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X64-SSE2-NEXT:    psrad $1, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX2-NEXT:    vpsrad $1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; X64-AVX2-NEXT:    retq
   %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
@@ -3199,53 +3195,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %
 define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    psrad $31, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-SSE2-NEXT:    psrad $2, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    psrad $31, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X64-SSE2-NEXT:    psrad $2, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; X64-AVX2-NEXT:    retq
   %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>

diff  --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll
index b051a7a07e6a..974af2c089a1 100644
--- a/llvm/test/CodeGen/X86/viabs.ll
+++ b/llvm/test/CodeGen/X86/viabs.ll
@@ -524,18 +524,16 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
 define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
 ; SSE2-LABEL: test_abs_ge_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_abs_ge_v2i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
 ; SSSE3-NEXT:    psubq %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
@@ -574,28 +572,24 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
 define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
 ; SSE2-LABEL: test_abs_gt_v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_abs_gt_v4i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSSE3-NEXT:    psubq %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    psubq %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
@@ -643,48 +637,40 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
 define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
 ; SSE2-LABEL: test_abs_le_v8i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    psubq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    psubq %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    psubq %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test_abs_le_v8i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSSE3-NEXT:    psubq %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSSE3-NEXT:    psubq %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    psubq %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSSE3-NEXT:    psubq %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
@@ -751,24 +737,20 @@ define <8 x i64> @test_abs_le_v8i64_fold(ptr %a.ptr) nounwind {
 ; SSE2-NEXT:    movdqu 16(%rdi), %xmm1
 ; SSE2-NEXT:    movdqu 32(%rdi), %xmm2
 ; SSE2-NEXT:    movdqu 48(%rdi), %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    psubq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    psubq %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    psubq %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    retq
@@ -779,24 +761,20 @@ define <8 x i64> @test_abs_le_v8i64_fold(ptr %a.ptr) nounwind {
 ; SSSE3-NEXT:    movdqu 16(%rdi), %xmm1
 ; SSSE3-NEXT:    movdqu 32(%rdi), %xmm2
 ; SSSE3-NEXT:    movdqu 48(%rdi), %xmm3
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSSE3-NEXT:    psubq %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSSE3-NEXT:    psubq %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    psubq %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSSE3-NEXT:    psubq %xmm4, %xmm3
 ; SSSE3-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll
index fefc5a2e9b5b..cc37f2c0c548 100644
--- a/llvm/test/CodeGen/X86/vselect-zero.ll
+++ b/llvm/test/CodeGen/X86/vselect-zero.ll
@@ -308,8 +308,8 @@ define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: signbit_mask_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -343,8 +343,8 @@ define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @signbit_mask_swap_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: signbit_mask_swap_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -521,11 +521,11 @@ define <8 x i32> @signbit_mask_swap_v8i32(<8 x i32> %a, <8 x i32> %b) {
 define <4 x i64> @signbit_mask_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: signbit_mask_v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -673,8 +673,8 @@ define <4 x i32> @signbit_setmask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @signbit_setmask_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: signbit_setmask_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -815,11 +815,11 @@ define <8 x i32> @signbit_setmask_v8i32(<8 x i32> %a, <8 x i32> %b) {
 define <4 x i64> @signbit_setmask_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: signbit_setmask_v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -867,11 +867,11 @@ define <4 x i64> @signbit_setmask_v4i64(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @signbit_setmask_swap_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: signbit_setmask_swap_v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index c444b5054b0e..3fd74a253b0f 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -484,8 +484,8 @@ define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b,
 ; SSE2-LABEL: shrunkblend_2uses:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    pandn %xmm2, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm1
@@ -523,8 +523,8 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
 ; SSE2-LABEL: shrunkblend_nonvselectuse:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    psrad $31, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    pandn %xmm2, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm1
@@ -536,8 +536,8 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    psllq $63, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vshift-3.ll b/llvm/test/CodeGen/X86/vshift-3.ll
index 1a0aee85b4a9..8d472f00b488 100644
--- a/llvm/test/CodeGen/X86/vshift-3.ll
+++ b/llvm/test/CodeGen/X86/vshift-3.ll
@@ -11,20 +11,20 @@ define void @shift1a(<2 x i64> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift1a:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X86-NEXT:    psrad $31, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT:    movdqa %xmm1, (%eax)
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrad $31, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shift1a:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT:    psrad $31, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT:    movdqa %xmm1, (%rdi)
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrad $31, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movdqa %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
   %ashr = ashr <2 x i64> %val, < i64 32, i64 32 >

diff  --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll
index c2e4d350d4ca..85fc1447b9de 100644
--- a/llvm/test/CodeGen/X86/vsplit-and.ll
+++ b/llvm/test/CodeGen/X86/vsplit-and.ll
@@ -43,8 +43,8 @@ define void @t2(ptr %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
 ; CHECK-NEXT:    andnps %xmm1, %xmm2
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; CHECK-NEXT:    psllq $63, %xmm0
-; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    psrad $31, %xmm0
 ; CHECK-NEXT:    pmovsxdq %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, (%rdi)
 ; CHECK-NEXT:    movq %xmm0, 16(%rdi)
