[llvm] ddfee6d - [X86] Support `X86ISD::PCMPEQ` and `X86ISD::PCMPGT` in ComputeKnownBits

Noah Goldstein via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 26 21:48:59 PDT 2023


Author: Noah Goldstein
Date: 2023-04-26T23:48:24-05:00
New Revision: ddfee6d0b6979fc6e61fa5ac7424096c358746fb

URL: https://github.com/llvm/llvm-project/commit/ddfee6d0b6979fc6e61fa5ac7424096c358746fb
DIFF: https://github.com/llvm/llvm-project/commit/ddfee6d0b6979fc6e61fa5ac7424096c358746fb.diff

LOG: [X86] Support `X86ISD::PCMPEQ` and `X86ISD::PCMPGT` in ComputeKnownBits

These functions where missing support but are used enough that it
makes sense to track them.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D148963

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/combine-srl.ll
    llvm/test/CodeGen/X86/fpclamptosat_vec.ll
    llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
    llvm/test/CodeGen/X86/psubus.ll
    llvm/test/CodeGen/X86/sat-add.ll
    llvm/test/CodeGen/X86/vec_setcc-2.ll
    llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
    llvm/test/CodeGen/X86/vector-trunc-usat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9e776e31a806a..2db70d270f5f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38595,6 +38595,23 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.Zero.setBitsFrom(16);
     break;
   }
+  case X86ISD::PCMPGT:
+  case X86ISD::PCMPEQ: {
+    KnownBits KnownLhs =
+        DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    KnownBits KnownRhs =
+        DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    std::optional<bool> Res = Opc == X86ISD::PCMPEQ
+                                  ? KnownBits::eq(KnownLhs, KnownRhs)
+                                  : KnownBits::sgt(KnownLhs, KnownRhs);
+    if (Res) {
+      if (*Res)
+        Known.setAllOnes();
+      else
+        Known.setAllZero();
+    }
+    break;
+  }
   case X86ISD::PMULUDQ: {
     KnownBits Known2;
     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

diff  --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 7bd2683286ab7..d2704e8f7af3a 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -346,23 +346,18 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pshufb %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrlw $4, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pshufb %xmm3, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm4, %xmm3
-; SSE-NEXT:    pand %xmm2, %xmm3
-; SSE-NEXT:    paddb %xmm1, %xmm3
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
-; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm3
-; SSE-NEXT:    paddw %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pshufb %xmm0, %xmm1
+; SSE-NEXT:    pcmpeqb %xmm3, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddb %xmm1, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    psrld $16, %xmm3
 ; SSE-NEXT:    paddd %xmm3, %xmm0
 ; SSE-NEXT:    psrld $5, %xmm0
 ; SSE-NEXT:    retq
@@ -372,22 +367,18 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
-; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
-; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
+; AVX-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>

diff  --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index c351c1b82cf19..37986f647804b 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -76,18 +76,16 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-NEXT:    pcmpgtd %xmm2, %xmm3
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    pand %xmm3, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm2, %xmm1
-; CHECK-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT:    por %xmm1, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    retq
 entry:
   %conv = fptoui <2 x double> %x to <2 x i64>
@@ -276,31 +274,27 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-NEXT:    pxor %xmm3, %xmm4
-; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm5
-; CHECK-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
 ; CHECK-NEXT:    movdqa %xmm6, %xmm7
-; CHECK-NEXT:    pcmpgtd %xmm4, %xmm7
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; CHECK-NEXT:    pand %xmm5, %xmm4
-; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; CHECK-NEXT:    por %xmm4, %xmm5
-; CHECK-NEXT:    pand %xmm5, %xmm0
-; CHECK-NEXT:    pandn %xmm2, %xmm5
-; CHECK-NEXT:    por %xmm0, %xmm5
+; CHECK-NEXT:    pcmpgtd %xmm5, %xmm7
+; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm3, %xmm4
+; CHECK-NEXT:    pand %xmm7, %xmm4
+; CHECK-NEXT:    pand %xmm4, %xmm0
+; CHECK-NEXT:    pandn %xmm2, %xmm4
+; CHECK-NEXT:    por %xmm0, %xmm4
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    pxor %xmm3, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm4
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm6
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; CHECK-NEXT:    pand %xmm4, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; CHECK-NEXT:    por %xmm3, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; CHECK-NEXT:    pcmpgtd %xmm5, %xmm6
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-NEXT:    pand %xmm6, %xmm0
 ; CHECK-NEXT:    pand %xmm0, %xmm1
 ; CHECK-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
 ; CHECK-NEXT:    retq
 entry:
   %conv = fptoui <4 x float> %x to <4 x i64>
@@ -560,33 +554,28 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-NEXT:    pxor %xmm2, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm4
-; CHECK-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
 ; CHECK-NEXT:    movdqa %xmm5, %xmm6
-; CHECK-NEXT:    pcmpgtd %xmm3, %xmm6
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; CHECK-NEXT:    pand %xmm4, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; CHECK-NEXT:    por %xmm3, %xmm4
-; CHECK-NEXT:    pand %xmm4, %xmm0
-; CHECK-NEXT:    pandn %xmm1, %xmm4
-; CHECK-NEXT:    por %xmm0, %xmm4
+; CHECK-NEXT:    pcmpgtd %xmm4, %xmm6
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-NEXT:    pand %xmm6, %xmm3
+; CHECK-NEXT:    pand %xmm3, %xmm0
+; CHECK-NEXT:    pandn %xmm1, %xmm3
+; CHECK-NEXT:    por %xmm0, %xmm3
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm6, %xmm0
 ; CHECK-NEXT:    pxor %xmm2, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm3
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm5
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; CHECK-NEXT:    pand %xmm3, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    movdqa %xmm6, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; CHECK-NEXT:    pcmpgtd %xmm4, %xmm5
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm0
+; CHECK-NEXT:    pand %xmm5, %xmm0
+; CHECK-NEXT:    pand %xmm0, %xmm6
 ; CHECK-NEXT:    pandn %xmm1, %xmm0
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; CHECK-NEXT:    por %xmm6, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -1661,18 +1650,16 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-NEXT:    pcmpgtd %xmm2, %xmm3
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    pand %xmm3, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm2, %xmm1
-; CHECK-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT:    por %xmm1, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    retq
 entry:
   %conv = fptoui <2 x double> %x to <2 x i64>
@@ -1855,32 +1842,28 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-NEXT:    pxor %xmm2, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm4
-; CHECK-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
 ; CHECK-NEXT:    movdqa %xmm5, %xmm6
-; CHECK-NEXT:    pcmpgtd %xmm3, %xmm6
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; CHECK-NEXT:    pand %xmm4, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; CHECK-NEXT:    por %xmm3, %xmm4
-; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; CHECK-NEXT:    pand %xmm4, %xmm0
-; CHECK-NEXT:    pandn %xmm3, %xmm4
-; CHECK-NEXT:    por %xmm0, %xmm4
+; CHECK-NEXT:    pcmpgtd %xmm4, %xmm6
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-NEXT:    pand %xmm6, %xmm3
+; CHECK-NEXT:    movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; CHECK-NEXT:    pand %xmm3, %xmm0
+; CHECK-NEXT:    pandn %xmm4, %xmm3
+; CHECK-NEXT:    por %xmm0, %xmm3
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    pxor %xmm2, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm6
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm5
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; CHECK-NEXT:    pand %xmm6, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; CHECK-NEXT:    por %xmm2, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; CHECK-NEXT:    pcmpgtd %xmm6, %xmm5
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm0
+; CHECK-NEXT:    pand %xmm5, %xmm0
 ; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    pandn %xmm3, %xmm0
+; CHECK-NEXT:    pandn %xmm4, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; CHECK-NEXT:    retq
 entry:
   %conv = fptoui <4 x float> %x to <4 x i64>
@@ -2134,34 +2117,29 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-NEXT:    pxor %xmm1, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
 ; CHECK-NEXT:    movdqa %xmm4, %xmm5
-; CHECK-NEXT:    pcmpgtd %xmm2, %xmm5
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; CHECK-NEXT:    pand %xmm3, %xmm2
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; CHECK-NEXT:    por %xmm2, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; CHECK-NEXT:    pand %xmm3, %xmm0
-; CHECK-NEXT:    pandn %xmm2, %xmm3
-; CHECK-NEXT:    por %xmm0, %xmm3
+; CHECK-NEXT:    pcmpgtd %xmm3, %xmm5
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm2
+; CHECK-NEXT:    pand %xmm5, %xmm2
+; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; CHECK-NEXT:    pand %xmm2, %xmm0
+; CHECK-NEXT:    pandn %xmm3, %xmm2
+; CHECK-NEXT:    por %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm6, %xmm0
 ; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm5
-; CHECK-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; CHECK-NEXT:    pand %xmm5, %xmm1
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm6, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    pandn %xmm2, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; CHECK-NEXT:    pcmpgtd %xmm5, %xmm4
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    pand %xmm4, %xmm0
+; CHECK-NEXT:    pand %xmm0, %xmm6
+; CHECK-NEXT:    pandn %xmm3, %xmm0
+; CHECK-NEXT:    por %xmm6, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
 ; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index fba710e839b22..c15e73daceb9b 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -11,59 +11,51 @@
 define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-LABEL: truncstore_v8i64_v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm11
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT:    movdqa %xmm9, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm11
-; SSE2-NEXT:    pand %xmm11, %xmm1
-; SSE2-NEXT:    pandn %xmm6, %xmm11
-; SSE2-NEXT:    por %xmm1, %xmm11
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    movdqa %xmm7, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm12, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pandn %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm1, %xmm9
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm10, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm13, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm7, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pand %xmm12, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    pandn %xmm6, %xmm1
 ; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pand %xmm10, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm11
-; SSE2-NEXT:    pxor %xmm8, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm11, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm12, %xmm8
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm7, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    pxor %xmm10, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm9[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm11, %xmm11
-; SSE2-NEXT:    pxor %xmm11, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSE2-NEXT:    pxor %xmm11, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm9
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm10
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
@@ -72,8 +64,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:  # %bb.1: # %cond.store
 ; SSE2-NEXT:    movss %xmm1, (%rdi)
 ; SSE2-NEXT:  .LBB0_2: # %else
-; SSE2-NEXT:    por %xmm10, %xmm0
-; SSE2-NEXT:    por %xmm9, %xmm8
+; SSE2-NEXT:    pand %xmm11, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm9
 ; SSE2-NEXT:    testb $2, %al
 ; SSE2-NEXT:    je .LBB0_4
 ; SSE2-NEXT:  # %bb.3: # %cond.store1
@@ -82,8 +74,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:  .LBB0_4: # %else2
 ; SSE2-NEXT:    pand %xmm0, %xmm3
 ; SSE2-NEXT:    pandn %xmm6, %xmm0
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    pandn %xmm6, %xmm8
+; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm9
 ; SSE2-NEXT:    testb $4, %al
 ; SSE2-NEXT:    je .LBB0_6
 ; SSE2-NEXT:  # %bb.5: # %cond.store3
@@ -91,7 +83,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    movd %xmm4, 8(%rdi)
 ; SSE2-NEXT:  .LBB0_6: # %else4
 ; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    por %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm9, %xmm2
 ; SSE2-NEXT:    testb $8, %al
 ; SSE2-NEXT:    je .LBB0_8
 ; SSE2-NEXT:  # %bb.7: # %cond.store5
@@ -319,68 +311,60 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-LABEL: truncstore_v8i64_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT:    movdqa %xmm2, %xmm8
-; SSE2-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm11
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm9, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm13, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    pandn %xmm7, %xmm8
-; SSE2-NEXT:    por %xmm2, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE2-NEXT:    pand %xmm12, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm2, %xmm7
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm13, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pand %xmm12, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm2
 ; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm9, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm11
-; SSE2-NEXT:    pand %xmm11, %xmm0
-; SSE2-NEXT:    pandn %xmm7, %xmm11
-; SSE2-NEXT:    por %xmm0, %xmm11
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pand %xmm12, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pandn %xmm8, %xmm3
 ; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pandn %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pand %xmm10, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
@@ -817,58 +801,50 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm10
 ; SSE2-NEXT:    pxor %xmm8, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm11
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm9, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm11
-; SSE2-NEXT:    pand %xmm11, %xmm1
-; SSE2-NEXT:    pandn %xmm7, %xmm11
-; SSE2-NEXT:    por %xmm1, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT:    pand %xmm12, %xmm10
+; SSE2-NEXT:    pand %xmm10, %xmm1
+; SSE2-NEXT:    pandn %xmm7, %xmm10
+; SSE2-NEXT:    por %xmm1, %xmm10
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm9, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm10, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm13, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT:    pand %xmm12, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    pandn %xmm7, %xmm1
 ; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm11, %xmm1
+; SSE2-NEXT:    packuswb %xmm10, %xmm1
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pand %xmm10, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm10
-; SSE2-NEXT:    pand %xmm10, %xmm3
-; SSE2-NEXT:    pandn %xmm7, %xmm10
-; SSE2-NEXT:    por %xmm3, %xmm10
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm11, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm9, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm2
 ; SSE2-NEXT:    pandn %xmm7, %xmm3
 ; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    packuswb %xmm10, %xmm3
+; SSE2-NEXT:    packuswb %xmm0, %xmm3
 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
 ; SSE2-NEXT:    packuswb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
@@ -1300,31 +1276,27 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
 ; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE2-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm7, %xmm1
-; SSE2-NEXT:    pandn %xmm4, %xmm7
-; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm6
+; SSE2-NEXT:    por %xmm1, %xmm6
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    pandn %xmm4, %xmm1
 ; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %eax
 ; SSE2-NEXT:    xorl $15, %eax
@@ -1496,33 +1468,29 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    pandn %xmm4, %xmm7
-; SSE2-NEXT:    por %xmm0, %xmm7
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE2-NEXT:    pand %xmm9, %xmm6
 ; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm4, %xmm5
-; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT:    pandn %xmm4, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
@@ -1780,33 +1748,29 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm9, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pandn %xmm5, %xmm4
 ; SSE2-NEXT:    por %xmm0, %xmm4
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm1
-; SSE2-NEXT:    pandn %xmm5, %xmm6
-; SSE2-NEXT:    por %xmm1, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    packuswb %xmm6, %xmm4
+; SSE2-NEXT:    packuswb %xmm0, %xmm4
 ; SSE2-NEXT:    packuswb %xmm4, %xmm4
 ; SSE2-NEXT:    packuswb %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
@@ -2068,18 +2032,16 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
@@ -2206,18 +2168,16 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
@@ -2355,28 +2315,26 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    packuswb %xmm3, %xmm3
+; SSE2-NEXT:    packuswb %xmm3, %xmm3
+; SSE2-NEXT:    packuswb %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
 ; SSE2-NEXT:    xorl $3, %eax
 ; SSE2-NEXT:    testb $1, %al
-; SSE2-NEXT:    movd %xmm4, %ecx
+; SSE2-NEXT:    movd %xmm3, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
 ; SSE2-NEXT:  # %bb.2: # %else
 ; SSE2-NEXT:    testb $2, %al

diff  --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 1afadf814c76b..9b25707fcc8be 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1611,15 +1611,13 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2OR3-NEXT:    movdqa %xmm2, %xmm7
 ; SSE2OR3-NEXT:    pxor %xmm5, %xmm7
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2OR3-NEXT:    movdqa %xmm6, %xmm9
+; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm9
 ; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
 ; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
-; SSE2OR3-NEXT:    movdqa %xmm6, %xmm9
-; SSE2OR3-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm8, %xmm7
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm7, %xmm8
+; SSE2OR3-NEXT:    pand %xmm9, %xmm8
 ; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535]
 ; SSE2OR3-NEXT:    pand %xmm8, %xmm2
 ; SSE2OR3-NEXT:    pandn %xmm7, %xmm8
@@ -1628,48 +1626,42 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE2OR3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
 ; SSE2OR3-NEXT:    movdqa %xmm1, %xmm8
 ; SSE2OR3-NEXT:    pxor %xmm5, %xmm8
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm9
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm6, %xmm10
-; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm10
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm9, %xmm8
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm8, %xmm9
-; SSE2OR3-NEXT:    pand %xmm9, %xmm1
-; SSE2OR3-NEXT:    pandn %xmm7, %xmm9
-; SSE2OR3-NEXT:    por %xmm1, %xmm9
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
+; SSE2OR3-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm8
+; SSE2OR3-NEXT:    pand %xmm10, %xmm8
+; SSE2OR3-NEXT:    pand %xmm8, %xmm1
+; SSE2OR3-NEXT:    pandn %xmm7, %xmm8
+; SSE2OR3-NEXT:    por %xmm1, %xmm8
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
 ; SSE2OR3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2OR3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2OR3-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2OR3-NEXT:    pxor %xmm5, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm8
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm6, %xmm9
-; SSE2OR3-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm8, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm2, %xmm8
-; SSE2OR3-NEXT:    pand %xmm8, %xmm4
-; SSE2OR3-NEXT:    pandn %xmm7, %xmm8
-; SSE2OR3-NEXT:    por %xmm4, %xmm8
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
+; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2OR3-NEXT:    pand %xmm9, %xmm2
+; SSE2OR3-NEXT:    pand %xmm2, %xmm4
+; SSE2OR3-NEXT:    pandn %xmm7, %xmm2
+; SSE2OR3-NEXT:    por %xmm4, %xmm2
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE2OR3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
 ; SSE2OR3-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2OR3-NEXT:    pxor %xmm5, %xmm4
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSE2OR3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm8, %xmm4
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm4, %xmm5
-; SSE2OR3-NEXT:    pand %xmm5, %xmm3
-; SSE2OR3-NEXT:    pandn %xmm7, %xmm5
-; SSE2OR3-NEXT:    por %xmm3, %xmm5
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2OR3-NEXT:    pand %xmm6, %xmm4
+; SSE2OR3-NEXT:    pand %xmm4, %xmm3
+; SSE2OR3-NEXT:    pandn %xmm7, %xmm4
+; SSE2OR3-NEXT:    por %xmm3, %xmm4
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
 ; SSE2OR3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
 ; SSE2OR3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE2OR3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
@@ -1678,56 +1670,53 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ;
 ; SSE41-LABEL: psubus_8i64_max:
 ; SSE41:       # %bb.0: # %vector.ph
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm8, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm6, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [65535,65535]
-; SSE41-NEXT:    movapd %xmm7, %xmm9
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm9
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm7, %xmm8
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [65535,65535]
+; SSE41-NEXT:    movapd %xmm8, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm10
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm8, %xmm0
-; SSE41-NEXT:    movdqa %xmm6, %xmm4
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm7, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    movdqa %xmm6, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm10, %xmm0
-; SSE41-NEXT:    movapd %xmm7, %xmm4
+; SSE41-NEXT:    movapd %xmm8, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT:    packusdw %xmm9, %xmm4
+; SSE41-NEXT:    packusdw %xmm10, %xmm4
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm8, %xmm0
-; SSE41-NEXT:    movdqa %xmm6, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm7, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm6, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm7, %xmm3
+; SSE41-NEXT:    movapd %xmm8, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm8
-; SSE41-NEXT:    movdqa %xmm6, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
-; SSE41-NEXT:    packusdw %xmm3, %xmm7
-; SSE41-NEXT:    packusdw %xmm4, %xmm7
-; SSE41-NEXT:    psubusw %xmm7, %xmm5
+; SSE41-NEXT:    pxor %xmm1, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm7, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
+; SSE41-NEXT:    packusdw %xmm3, %xmm8
+; SSE41-NEXT:    packusdw %xmm4, %xmm8
+; SSE41-NEXT:    psubusw %xmm8, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: psubus_8i64_max:
@@ -2675,127 +2664,116 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE2OR3-NEXT:    movdqa %xmm3, %xmm8
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm8
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2OR3-NEXT:    movdqa %xmm7, %xmm10
+; SSE2OR3-NEXT:    pcmpgtd %xmm9, %xmm10
 ; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
 ; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm9
-; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT:    movdqa %xmm7, %xmm10
-; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm10
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm9, %xmm8
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm8, %xmm9
+; SSE2OR3-NEXT:    pand %xmm10, %xmm9
 ; SSE2OR3-NEXT:    pcmpeqd %xmm8, %xmm8
 ; SSE2OR3-NEXT:    pand %xmm9, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm8, %xmm9
 ; SSE2OR3-NEXT:    por %xmm3, %xmm9
 ; SSE2OR3-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm10
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm7, %xmm11
-; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm11
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm10, %xmm3
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm3, %xmm10
-; SSE2OR3-NEXT:    pand %xmm10, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm8, %xmm10
-; SSE2OR3-NEXT:    por %xmm2, %xmm10
-; SSE2OR3-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pand %xmm11, %xmm3
+; SSE2OR3-NEXT:    pand %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm8, %xmm3
+; SSE2OR3-NEXT:    por %xmm2, %xmm3
+; SSE2OR3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
 ; SSE2OR3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT:    psubd %xmm10, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm10
+; SSE2OR3-NEXT:    psubd %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm0
-; SSE2OR3-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm0
 ; SSE2OR3-NEXT:    pand %xmm2, %xmm0
 ; SSE2OR3-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm7, %xmm9
-; SSE2OR3-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm3, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm2, %xmm3
-; SSE2OR3-NEXT:    pand %xmm3, %xmm5
-; SSE2OR3-NEXT:    pxor %xmm8, %xmm3
-; SSE2OR3-NEXT:    por %xmm5, %xmm3
-; SSE2OR3-NEXT:    movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2OR3-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm5, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm2, %xmm5
-; SSE2OR3-NEXT:    pxor %xmm5, %xmm8
-; SSE2OR3-NEXT:    pand %xmm4, %xmm5
-; SSE2OR3-NEXT:    por %xmm8, %xmm5
-; SSE2OR3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm9
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm2
+; SSE2OR3-NEXT:    pand %xmm9, %xmm2
+; SSE2OR3-NEXT:    pand %xmm2, %xmm5
+; SSE2OR3-NEXT:    pxor %xmm8, %xmm2
+; SSE2OR3-NEXT:    por %xmm5, %xmm2
+; SSE2OR3-NEXT:    movdqa %xmm4, %xmm3
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pand %xmm7, %xmm3
+; SSE2OR3-NEXT:    pxor %xmm3, %xmm8
+; SSE2OR3-NEXT:    pand %xmm4, %xmm3
+; SSE2OR3-NEXT:    por %xmm8, %xmm3
+; SSE2OR3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
 ; SSE2OR3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2OR3-NEXT:    psubd %xmm5, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm5
+; SSE2OR3-NEXT:    psubd %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm1
-; SSE2OR3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE2OR3-NEXT:    pand %xmm2, %xmm1
 ; SSE2OR3-NEXT:    retq
 ;
 ; SSE41-LABEL: test33:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm7, %xmm8
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm10, %xmm0
-; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE41-NEXT:    movapd %xmm8, %xmm10
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm10
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm8, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSE41-NEXT:    movapd %xmm9, %xmm11
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm11
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm7, %xmm11
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm11, %xmm0
-; SSE41-NEXT:    movapd %xmm8, %xmm3
+; SSE41-NEXT:    movapd %xmm9, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
-; SSE41-NEXT:    pmaxud %xmm3, %xmm6
-; SSE41-NEXT:    psubd %xmm3, %xmm6
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
+; SSE41-NEXT:    pmaxud %xmm3, %xmm7
+; SSE41-NEXT:    psubd %xmm3, %xmm7
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm7, %xmm2
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm8, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movapd %xmm8, %xmm2
+; SSE41-NEXT:    movapd %xmm9, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm9
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
-; SSE41-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
-; SSE41-NEXT:    pmaxud %xmm8, %xmm1
-; SSE41-NEXT:    psubd %xmm8, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm10
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pand %xmm8, %xmm6
 ; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm9
+; SSE41-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
+; SSE41-NEXT:    pmaxud %xmm9, %xmm1
+; SSE41-NEXT:    psubd %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test33:
@@ -2906,70 +2884,62 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE2OR3-NEXT:    movdqa %xmm3, %xmm8
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm8
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2OR3-NEXT:    movdqa %xmm7, %xmm10
+; SSE2OR3-NEXT:    pcmpgtd %xmm9, %xmm10
 ; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
 ; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm9
-; SSE2OR3-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT:    movdqa %xmm7, %xmm10
-; SSE2OR3-NEXT:    pcmpgtd %xmm8, %xmm10
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm9, %xmm8
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm8, %xmm9
+; SSE2OR3-NEXT:    pand %xmm10, %xmm9
 ; SSE2OR3-NEXT:    pcmpeqd %xmm8, %xmm8
 ; SSE2OR3-NEXT:    pand %xmm9, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm8, %xmm9
 ; SSE2OR3-NEXT:    por %xmm3, %xmm9
 ; SSE2OR3-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm10
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm7, %xmm11
-; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm11
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm10, %xmm3
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm3, %xmm10
-; SSE2OR3-NEXT:    pand %xmm10, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm8, %xmm10
-; SSE2OR3-NEXT:    por %xmm2, %xmm10
-; SSE2OR3-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pand %xmm11, %xmm3
+; SSE2OR3-NEXT:    pand %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm8, %xmm3
+; SSE2OR3-NEXT:    por %xmm2, %xmm3
+; SSE2OR3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
 ; SSE2OR3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT:    psubd %xmm10, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm10
+; SSE2OR3-NEXT:    psubd %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
 ; SSE2OR3-NEXT:    por %xmm6, %xmm0
-; SSE2OR3-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm0
 ; SSE2OR3-NEXT:    pand %xmm2, %xmm0
 ; SSE2OR3-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2OR3-NEXT:    pxor %xmm6, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE2OR3-NEXT:    movdqa %xmm7, %xmm9
-; SSE2OR3-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm3, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm2, %xmm3
-; SSE2OR3-NEXT:    pand %xmm3, %xmm5
-; SSE2OR3-NEXT:    pxor %xmm8, %xmm3
-; SSE2OR3-NEXT:    por %xmm5, %xmm3
-; SSE2OR3-NEXT:    movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2OR3-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT:    pand %xmm5, %xmm2
-; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT:    por %xmm2, %xmm5
-; SSE2OR3-NEXT:    pxor %xmm5, %xmm8
-; SSE2OR3-NEXT:    pand %xmm4, %xmm5
-; SSE2OR3-NEXT:    por %xmm8, %xmm5
-; SSE2OR3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm9
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm2
+; SSE2OR3-NEXT:    pand %xmm9, %xmm2
+; SSE2OR3-NEXT:    pand %xmm2, %xmm5
+; SSE2OR3-NEXT:    pxor %xmm8, %xmm2
+; SSE2OR3-NEXT:    por %xmm5, %xmm2
+; SSE2OR3-NEXT:    movdqa %xmm4, %xmm3
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2OR3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2OR3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT:    pand %xmm7, %xmm3
+; SSE2OR3-NEXT:    pxor %xmm3, %xmm8
+; SSE2OR3-NEXT:    pand %xmm4, %xmm3
+; SSE2OR3-NEXT:    por %xmm8, %xmm3
+; SSE2OR3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
 ; SSE2OR3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2OR3-NEXT:    psubd %xmm5, %xmm2
-; SSE2OR3-NEXT:    pxor %xmm6, %xmm5
+; SSE2OR3-NEXT:    psubd %xmm3, %xmm2
+; SSE2OR3-NEXT:    pxor %xmm6, %xmm3
 ; SSE2OR3-NEXT:    por %xmm6, %xmm1
-; SSE2OR3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2OR3-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE2OR3-NEXT:    pand %xmm2, %xmm1
 ; SSE2OR3-NEXT:    retq
 ;
@@ -2979,56 +2949,53 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1]
 ; SSE41-NEXT:    pand %xmm0, %xmm1
 ; SSE41-NEXT:    pand %xmm0, %xmm6
-; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm7, %xmm8
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm10, %xmm0
-; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE41-NEXT:    movapd %xmm8, %xmm10
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm10
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm8, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSE41-NEXT:    movapd %xmm9, %xmm11
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm11
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm7, %xmm11
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm11, %xmm0
-; SSE41-NEXT:    movapd %xmm8, %xmm3
+; SSE41-NEXT:    movapd %xmm9, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
 ; SSE41-NEXT:    pmaxud %xmm3, %xmm6
 ; SSE41-NEXT:    psubd %xmm3, %xmm6
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm7, %xmm2
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm8, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movapd %xmm8, %xmm2
+; SSE41-NEXT:    movapd %xmm9, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm9
-; SSE41-NEXT:    movdqa %xmm7, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
-; SSE41-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
-; SSE41-NEXT:    pmaxud %xmm8, %xmm1
-; SSE41-NEXT:    psubd %xmm8, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm10
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT:    pand %xmm8, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm9
+; SSE41-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
+; SSE41-NEXT:    pmaxud %xmm9, %xmm1
+; SSE41-NEXT:    psubd %xmm9, %xmm1
 ; SSE41-NEXT:    movdqa %xmm6, %xmm0
 ; SSE41-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index ef62872f51069..9407c5b6aa25c 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -750,14 +750,12 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
@@ -765,14 +763,12 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
 ; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
 ; SSE41-NEXT:    por %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:

diff  --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll
index 04b98d2f58b4f..367f9f21041a5 100644
--- a/llvm/test/CodeGen/X86/vec_setcc-2.ll
+++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -303,14 +303,12 @@ define <2 x i1> @uge_v2i64_splat(<2 x i64> %x) {
 ; SSE2-LABEL: uge_v2i64_splat:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -379,14 +377,12 @@ define <2 x i1> @ult_v2i64_splat(<2 x i64> %x) {
 ; SSE2-LABEL: ult_v2i64_splat:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_v2i64_splat:

diff  --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
index d182e126bc105..56daf987c829e 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
@@ -16970,15 +16970,10 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_2_v2i64:
@@ -16998,15 +16993,10 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_2_v2i64:
@@ -17022,15 +17012,10 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_2_v2i64:
@@ -17046,15 +17031,10 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_2_v2i64:
@@ -17144,16 +17124,11 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651,2147483651,2147483651]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_3_v2i64:
@@ -17173,16 +17148,11 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651,2147483651,2147483651]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_3_v2i64:
@@ -17198,16 +17168,11 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651,2147483651,2147483651]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_3_v2i64:
@@ -17223,16 +17188,11 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483651,2147483651,2147483651,2147483651]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_3_v2i64:
@@ -17326,15 +17286,10 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_3_v2i64:
@@ -17354,15 +17309,10 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_3_v2i64:
@@ -17378,15 +17328,10 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_3_v2i64:
@@ -17402,15 +17347,10 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_3_v2i64:
@@ -17500,16 +17440,11 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652,2147483652,2147483652]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_4_v2i64:
@@ -17529,16 +17464,11 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652,2147483652,2147483652]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_4_v2i64:
@@ -17554,16 +17484,11 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652,2147483652,2147483652]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_4_v2i64:
@@ -17579,16 +17504,11 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483652,2147483652,2147483652,2147483652]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_4_v2i64:
@@ -17682,15 +17602,10 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_4_v2i64:
@@ -17710,15 +17625,10 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_4_v2i64:
@@ -17734,15 +17644,10 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_4_v2i64:
@@ -17758,15 +17663,10 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_4_v2i64:
@@ -17856,16 +17756,11 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653,2147483653,2147483653]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_5_v2i64:
@@ -17885,16 +17780,11 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653,2147483653,2147483653]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_5_v2i64:
@@ -17910,16 +17800,11 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653,2147483653,2147483653]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_5_v2i64:
@@ -17935,16 +17820,11 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483653,2147483653,2147483653,2147483653]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_5_v2i64:
@@ -18038,15 +17918,10 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_5_v2i64:
@@ -18066,15 +17941,10 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_5_v2i64:
@@ -18090,15 +17960,10 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_5_v2i64:
@@ -18114,15 +17979,10 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_5_v2i64:
@@ -18212,16 +18072,11 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654,2147483654,2147483654]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_6_v2i64:
@@ -18241,16 +18096,11 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654,2147483654,2147483654]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_6_v2i64:
@@ -18266,16 +18116,11 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654,2147483654,2147483654]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_6_v2i64:
@@ -18291,16 +18136,11 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483654,2147483654,2147483654,2147483654]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_6_v2i64:
@@ -18394,15 +18234,10 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_6_v2i64:
@@ -18422,15 +18257,10 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_6_v2i64:
@@ -18446,15 +18276,10 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_6_v2i64:
@@ -18470,15 +18295,10 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_6_v2i64:
@@ -18568,16 +18388,11 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655,2147483655,2147483655]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_7_v2i64:
@@ -18597,16 +18412,11 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655,2147483655,2147483655]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_7_v2i64:
@@ -18622,16 +18432,11 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655,2147483655,2147483655]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_7_v2i64:
@@ -18647,16 +18452,11 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483655,2147483655,2147483655,2147483655]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_7_v2i64:
@@ -18750,15 +18550,10 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_7_v2i64:
@@ -18778,15 +18573,10 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_7_v2i64:
@@ -18802,15 +18592,10 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_7_v2i64:
@@ -18826,15 +18611,10 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_7_v2i64:
@@ -18924,16 +18704,11 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656,2147483656,2147483656]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_8_v2i64:
@@ -18953,16 +18728,11 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656,2147483656,2147483656]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_8_v2i64:
@@ -18978,16 +18748,11 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656,2147483656,2147483656]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_8_v2i64:
@@ -19003,16 +18768,11 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483656,2147483656,2147483656,2147483656]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_8_v2i64:
@@ -19106,15 +18866,10 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_8_v2i64:
@@ -19134,15 +18889,10 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_8_v2i64:
@@ -19158,15 +18908,10 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_8_v2i64:
@@ -19182,15 +18927,10 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_8_v2i64:
@@ -19280,16 +19020,11 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657,2147483657,2147483657]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_9_v2i64:
@@ -19309,16 +19044,11 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657,2147483657,2147483657]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_9_v2i64:
@@ -19334,16 +19064,11 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657,2147483657,2147483657]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_9_v2i64:
@@ -19359,16 +19084,11 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483657,2147483657,2147483657,2147483657]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_9_v2i64:
@@ -19462,15 +19182,10 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_9_v2i64:
@@ -19490,15 +19205,10 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_9_v2i64:
@@ -19514,15 +19224,10 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_9_v2i64:
@@ -19538,15 +19243,10 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_9_v2i64:
@@ -19636,16 +19336,11 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658,2147483658,2147483658]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_10_v2i64:
@@ -19665,16 +19360,11 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658,2147483658,2147483658]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_10_v2i64:
@@ -19690,16 +19380,11 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658,2147483658,2147483658]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_10_v2i64:
@@ -19715,16 +19400,11 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483658,2147483658,2147483658,2147483658]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_10_v2i64:
@@ -19818,15 +19498,10 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_10_v2i64:
@@ -19846,15 +19521,10 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_10_v2i64:
@@ -19870,15 +19540,10 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_10_v2i64:
@@ -19894,15 +19559,10 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_10_v2i64:
@@ -19992,16 +19652,11 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659,2147483659,2147483659]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_11_v2i64:
@@ -20021,16 +19676,11 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659,2147483659,2147483659]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_11_v2i64:
@@ -20046,16 +19696,11 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659,2147483659,2147483659]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_11_v2i64:
@@ -20071,16 +19716,11 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483659,2147483659,2147483659,2147483659]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_11_v2i64:
@@ -20174,15 +19814,10 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_11_v2i64:
@@ -20202,15 +19837,10 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_11_v2i64:
@@ -20226,15 +19856,10 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_11_v2i64:
@@ -20250,15 +19875,10 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_11_v2i64:
@@ -20348,16 +19968,11 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660,2147483660,2147483660]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_12_v2i64:
@@ -20377,16 +19992,11 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660,2147483660,2147483660]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_12_v2i64:
@@ -20402,16 +20012,11 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660,2147483660,2147483660]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_12_v2i64:
@@ -20427,16 +20032,11 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483660,2147483660,2147483660,2147483660]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_12_v2i64:
@@ -20530,15 +20130,10 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_12_v2i64:
@@ -20558,15 +20153,10 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_12_v2i64:
@@ -20582,15 +20172,10 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_12_v2i64:
@@ -20606,15 +20191,10 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_12_v2i64:
@@ -20704,16 +20284,11 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661,2147483661,2147483661]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_13_v2i64:
@@ -20733,16 +20308,11 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661,2147483661,2147483661]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_13_v2i64:
@@ -20758,16 +20328,11 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661,2147483661,2147483661]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_13_v2i64:
@@ -20783,16 +20348,11 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483661,2147483661,2147483661,2147483661]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_13_v2i64:
@@ -20886,15 +20446,10 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_13_v2i64:
@@ -20914,15 +20469,10 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_13_v2i64:
@@ -20938,15 +20488,10 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_13_v2i64:
@@ -20962,15 +20507,10 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_13_v2i64:
@@ -21060,16 +20600,11 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662,2147483662,2147483662]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_14_v2i64:
@@ -21089,16 +20624,11 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662,2147483662,2147483662]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_14_v2i64:
@@ -21114,16 +20644,11 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662,2147483662,2147483662]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_14_v2i64:
@@ -21139,16 +20664,11 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483662,2147483662,2147483662,2147483662]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_14_v2i64:
@@ -21242,15 +20762,10 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_14_v2i64:
@@ -21270,15 +20785,10 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_14_v2i64:
@@ -21294,15 +20804,10 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_14_v2i64:
@@ -21318,15 +20823,10 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_14_v2i64:
@@ -21416,16 +20916,11 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663,2147483663,2147483663]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_15_v2i64:
@@ -21445,16 +20940,11 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663,2147483663,2147483663]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_15_v2i64:
@@ -21470,16 +20960,11 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663,2147483663,2147483663]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_15_v2i64:
@@ -21495,16 +20980,11 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483663,2147483663,2147483663,2147483663]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_15_v2i64:
@@ -21598,15 +21078,10 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_15_v2i64:
@@ -21626,15 +21101,10 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_15_v2i64:
@@ -21650,15 +21120,10 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_15_v2i64:
@@ -21674,15 +21139,10 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_15_v2i64:
@@ -21772,16 +21232,11 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664,2147483664,2147483664]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_16_v2i64:
@@ -21801,16 +21256,11 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664,2147483664,2147483664]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_16_v2i64:
@@ -21826,16 +21276,11 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664,2147483664,2147483664]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_16_v2i64:
@@ -21851,16 +21296,11 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483664,2147483664,2147483664,2147483664]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_16_v2i64:
@@ -21954,15 +21394,10 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_16_v2i64:
@@ -21982,15 +21417,10 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_16_v2i64:
@@ -22006,15 +21436,10 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_16_v2i64:
@@ -22030,15 +21455,10 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_16_v2i64:
@@ -22128,16 +21548,11 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665,2147483665,2147483665]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_17_v2i64:
@@ -22157,16 +21572,11 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665,2147483665,2147483665]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_17_v2i64:
@@ -22182,16 +21592,11 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665,2147483665,2147483665]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_17_v2i64:
@@ -22207,16 +21612,11 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483665,2147483665,2147483665,2147483665]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_17_v2i64:
@@ -22310,15 +21710,10 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_17_v2i64:
@@ -22338,15 +21733,10 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_17_v2i64:
@@ -22362,15 +21752,10 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_17_v2i64:
@@ -22386,15 +21771,10 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_17_v2i64:
@@ -22484,16 +21864,11 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666,2147483666,2147483666]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_18_v2i64:
@@ -22513,16 +21888,11 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666,2147483666,2147483666]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_18_v2i64:
@@ -22538,16 +21908,11 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666,2147483666,2147483666]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_18_v2i64:
@@ -22563,16 +21928,11 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483666,2147483666,2147483666,2147483666]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_18_v2i64:
@@ -22666,15 +22026,10 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_18_v2i64:
@@ -22694,15 +22049,10 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_18_v2i64:
@@ -22718,15 +22068,10 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_18_v2i64:
@@ -22742,15 +22087,10 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_18_v2i64:
@@ -22840,16 +22180,11 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667,2147483667,2147483667]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_19_v2i64:
@@ -22869,16 +22204,11 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667,2147483667,2147483667]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_19_v2i64:
@@ -22894,16 +22224,11 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667,2147483667,2147483667]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_19_v2i64:
@@ -22919,16 +22244,11 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483667,2147483667,2147483667,2147483667]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_19_v2i64:
@@ -23022,15 +22342,10 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_19_v2i64:
@@ -23050,15 +22365,10 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_19_v2i64:
@@ -23074,15 +22384,10 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_19_v2i64:
@@ -23098,15 +22403,10 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_19_v2i64:
@@ -23196,16 +22496,11 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668,2147483668,2147483668]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_20_v2i64:
@@ -23225,16 +22520,11 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668,2147483668,2147483668]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_20_v2i64:
@@ -23250,16 +22540,11 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668,2147483668,2147483668]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_20_v2i64:
@@ -23275,16 +22560,11 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483668,2147483668,2147483668,2147483668]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_20_v2i64:
@@ -23378,15 +22658,10 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_20_v2i64:
@@ -23406,15 +22681,10 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_20_v2i64:
@@ -23430,15 +22700,10 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_20_v2i64:
@@ -23454,15 +22719,10 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_20_v2i64:
@@ -23552,16 +22812,11 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669,2147483669,2147483669]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_21_v2i64:
@@ -23581,16 +22836,11 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669,2147483669,2147483669]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_21_v2i64:
@@ -23606,16 +22856,11 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669,2147483669,2147483669]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_21_v2i64:
@@ -23631,16 +22876,11 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483669,2147483669,2147483669,2147483669]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_21_v2i64:
@@ -23734,15 +22974,10 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_21_v2i64:
@@ -23762,15 +22997,10 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_21_v2i64:
@@ -23786,15 +23016,10 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_21_v2i64:
@@ -23810,15 +23035,10 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_21_v2i64:
@@ -23908,16 +23128,11 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670,2147483670,2147483670]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_22_v2i64:
@@ -23937,16 +23152,11 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670,2147483670,2147483670]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_22_v2i64:
@@ -23962,16 +23172,11 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670,2147483670,2147483670]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_22_v2i64:
@@ -23987,16 +23192,11 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483670,2147483670,2147483670,2147483670]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_22_v2i64:
@@ -24090,15 +23290,10 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_22_v2i64:
@@ -24118,15 +23313,10 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_22_v2i64:
@@ -24142,15 +23332,10 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_22_v2i64:
@@ -24166,15 +23351,10 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_22_v2i64:
@@ -24264,16 +23444,11 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671,2147483671,2147483671]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_23_v2i64:
@@ -24293,16 +23468,11 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671,2147483671,2147483671]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_23_v2i64:
@@ -24318,16 +23488,11 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671,2147483671,2147483671]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_23_v2i64:
@@ -24343,16 +23508,11 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483671,2147483671,2147483671,2147483671]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_23_v2i64:
@@ -24446,15 +23606,10 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_23_v2i64:
@@ -24474,15 +23629,10 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_23_v2i64:
@@ -24498,15 +23648,10 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_23_v2i64:
@@ -24522,15 +23667,10 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_23_v2i64:
@@ -24620,16 +23760,11 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672,2147483672,2147483672]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_24_v2i64:
@@ -24649,16 +23784,11 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672,2147483672,2147483672]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_24_v2i64:
@@ -24674,16 +23804,11 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672,2147483672,2147483672]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_24_v2i64:
@@ -24699,16 +23824,11 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483672,2147483672,2147483672,2147483672]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_24_v2i64:
@@ -24802,15 +23922,10 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_24_v2i64:
@@ -24830,15 +23945,10 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_24_v2i64:
@@ -24854,15 +23964,10 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_24_v2i64:
@@ -24878,15 +23983,10 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_24_v2i64:
@@ -24976,16 +24076,11 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673,2147483673,2147483673]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_25_v2i64:
@@ -25005,16 +24100,11 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673,2147483673,2147483673]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_25_v2i64:
@@ -25030,16 +24120,11 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673,2147483673,2147483673]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_25_v2i64:
@@ -25055,16 +24140,11 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483673,2147483673,2147483673,2147483673]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_25_v2i64:
@@ -25158,15 +24238,10 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_25_v2i64:
@@ -25186,15 +24261,10 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_25_v2i64:
@@ -25210,15 +24280,10 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_25_v2i64:
@@ -25234,15 +24299,10 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_25_v2i64:
@@ -25332,16 +24392,11 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674,2147483674,2147483674]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_26_v2i64:
@@ -25361,16 +24416,11 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674,2147483674,2147483674]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_26_v2i64:
@@ -25386,16 +24436,11 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674,2147483674,2147483674]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_26_v2i64:
@@ -25411,16 +24456,11 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483674,2147483674,2147483674,2147483674]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_26_v2i64:
@@ -25514,15 +24554,10 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_26_v2i64:
@@ -25542,15 +24577,10 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_26_v2i64:
@@ -25566,15 +24596,10 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_26_v2i64:
@@ -25590,15 +24615,10 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_26_v2i64:
@@ -25688,16 +24708,11 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675,2147483675,2147483675]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_27_v2i64:
@@ -25717,16 +24732,11 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675,2147483675,2147483675]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_27_v2i64:
@@ -25742,16 +24752,11 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675,2147483675,2147483675]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_27_v2i64:
@@ -25767,16 +24772,11 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483675,2147483675,2147483675,2147483675]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_27_v2i64:
@@ -25870,15 +24870,10 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_27_v2i64:
@@ -25898,15 +24893,10 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_27_v2i64:
@@ -25922,15 +24912,10 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_27_v2i64:
@@ -25946,15 +24931,10 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_27_v2i64:
@@ -26044,16 +25024,11 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676,2147483676,2147483676]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_28_v2i64:
@@ -26073,16 +25048,11 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676,2147483676,2147483676]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_28_v2i64:
@@ -26098,16 +25068,11 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676,2147483676,2147483676]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_28_v2i64:
@@ -26123,16 +25088,11 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483676,2147483676,2147483676,2147483676]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_28_v2i64:
@@ -26226,15 +25186,10 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_28_v2i64:
@@ -26254,15 +25209,10 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_28_v2i64:
@@ -26278,15 +25228,10 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_28_v2i64:
@@ -26302,15 +25247,10 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_28_v2i64:
@@ -26400,16 +25340,11 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677,2147483677,2147483677]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_29_v2i64:
@@ -26429,16 +25364,11 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677,2147483677,2147483677]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_29_v2i64:
@@ -26454,16 +25384,11 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677,2147483677,2147483677]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_29_v2i64:
@@ -26479,16 +25404,11 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483677,2147483677,2147483677,2147483677]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_29_v2i64:
@@ -26582,15 +25502,10 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_29_v2i64:
@@ -26610,15 +25525,10 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_29_v2i64:
@@ -26634,15 +25544,10 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_29_v2i64:
@@ -26658,15 +25563,10 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_29_v2i64:
@@ -26756,16 +25656,11 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678,2147483678,2147483678]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_30_v2i64:
@@ -26785,16 +25680,11 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678,2147483678,2147483678]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_30_v2i64:
@@ -26810,16 +25700,11 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678,2147483678,2147483678]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_30_v2i64:
@@ -26835,16 +25720,11 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483678,2147483678,2147483678,2147483678]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_30_v2i64:
@@ -26938,15 +25818,10 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_30_v2i64:
@@ -26966,15 +25841,10 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_30_v2i64:
@@ -26990,15 +25860,10 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_30_v2i64:
@@ -27014,15 +25879,10 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_30_v2i64:
@@ -27112,16 +25972,11 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679,2147483679,2147483679]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_31_v2i64:
@@ -27141,16 +25996,11 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679,2147483679,2147483679]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_31_v2i64:
@@ -27166,16 +26016,11 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679,2147483679,2147483679]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_31_v2i64:
@@ -27191,16 +26036,11 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483679,2147483679,2147483679,2147483679]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_31_v2i64:
@@ -27294,15 +26134,10 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_31_v2i64:
@@ -27322,15 +26157,10 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_31_v2i64:
@@ -27346,15 +26176,10 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_31_v2i64:
@@ -27370,15 +26195,10 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_31_v2i64:
@@ -27468,16 +26288,11 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680,2147483680,2147483680]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_32_v2i64:
@@ -27497,16 +26312,11 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680,2147483680,2147483680]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_32_v2i64:
@@ -27522,16 +26332,11 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680,2147483680,2147483680]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_32_v2i64:
@@ -27547,16 +26352,11 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483680,2147483680,2147483680,2147483680]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_32_v2i64:
@@ -27650,15 +26450,10 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_32_v2i64:
@@ -27678,15 +26473,10 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_32_v2i64:
@@ -27702,15 +26492,10 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_32_v2i64:
@@ -27726,15 +26511,10 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_32_v2i64:
@@ -27824,16 +26604,11 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681,2147483681,2147483681]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_33_v2i64:
@@ -27853,16 +26628,11 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681,2147483681,2147483681]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_33_v2i64:
@@ -27878,16 +26648,11 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681,2147483681,2147483681]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_33_v2i64:
@@ -27903,16 +26668,11 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483681,2147483681,2147483681,2147483681]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_33_v2i64:
@@ -28006,15 +26766,10 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_33_v2i64:
@@ -28034,15 +26789,10 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_33_v2i64:
@@ -28058,15 +26808,10 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_33_v2i64:
@@ -28082,15 +26827,10 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_33_v2i64:
@@ -28180,16 +26920,11 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682,2147483682,2147483682]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_34_v2i64:
@@ -28209,16 +26944,11 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682,2147483682,2147483682]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_34_v2i64:
@@ -28234,16 +26964,11 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682,2147483682,2147483682]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_34_v2i64:
@@ -28259,16 +26984,11 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483682,2147483682,2147483682,2147483682]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_34_v2i64:
@@ -28362,15 +27082,10 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_34_v2i64:
@@ -28390,15 +27105,10 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_34_v2i64:
@@ -28414,15 +27124,10 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_34_v2i64:
@@ -28438,15 +27143,10 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_34_v2i64:
@@ -28536,16 +27236,11 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683,2147483683,2147483683]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_35_v2i64:
@@ -28565,16 +27260,11 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683,2147483683,2147483683]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_35_v2i64:
@@ -28590,16 +27280,11 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683,2147483683,2147483683]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_35_v2i64:
@@ -28615,16 +27300,11 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483683,2147483683,2147483683,2147483683]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_35_v2i64:
@@ -28718,15 +27398,10 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_35_v2i64:
@@ -28746,15 +27421,10 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_35_v2i64:
@@ -28770,15 +27440,10 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_35_v2i64:
@@ -28794,15 +27459,10 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_35_v2i64:
@@ -28892,16 +27552,11 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684,2147483684,2147483684]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_36_v2i64:
@@ -28921,16 +27576,11 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684,2147483684,2147483684]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_36_v2i64:
@@ -28946,16 +27596,11 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684,2147483684,2147483684]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_36_v2i64:
@@ -28971,16 +27616,11 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483684,2147483684,2147483684,2147483684]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_36_v2i64:
@@ -29074,15 +27714,10 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_36_v2i64:
@@ -29102,15 +27737,10 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_36_v2i64:
@@ -29126,15 +27756,10 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_36_v2i64:
@@ -29150,15 +27775,10 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_36_v2i64:
@@ -29248,16 +27868,11 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685,2147483685,2147483685]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_37_v2i64:
@@ -29277,16 +27892,11 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685,2147483685,2147483685]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_37_v2i64:
@@ -29302,16 +27912,11 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685,2147483685,2147483685]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_37_v2i64:
@@ -29327,16 +27932,11 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483685,2147483685,2147483685,2147483685]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_37_v2i64:
@@ -29430,15 +28030,10 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_37_v2i64:
@@ -29458,15 +28053,10 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_37_v2i64:
@@ -29482,15 +28072,10 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_37_v2i64:
@@ -29506,15 +28091,10 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_37_v2i64:
@@ -29604,16 +28184,11 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686,2147483686,2147483686]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_38_v2i64:
@@ -29633,16 +28208,11 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686,2147483686,2147483686]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_38_v2i64:
@@ -29658,16 +28228,11 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686,2147483686,2147483686]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_38_v2i64:
@@ -29683,16 +28248,11 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483686,2147483686,2147483686,2147483686]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_38_v2i64:
@@ -29786,15 +28346,10 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_38_v2i64:
@@ -29814,15 +28369,10 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_38_v2i64:
@@ -29838,15 +28388,10 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_38_v2i64:
@@ -29862,15 +28407,10 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_38_v2i64:
@@ -29960,16 +28500,11 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687,2147483687,2147483687]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_39_v2i64:
@@ -29989,16 +28524,11 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687,2147483687,2147483687]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_39_v2i64:
@@ -30014,16 +28544,11 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687,2147483687,2147483687]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_39_v2i64:
@@ -30039,16 +28564,11 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483687,2147483687,2147483687,2147483687]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_39_v2i64:
@@ -30142,15 +28662,10 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_39_v2i64:
@@ -30170,15 +28685,10 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_39_v2i64:
@@ -30194,15 +28704,10 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_39_v2i64:
@@ -30218,15 +28723,10 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_39_v2i64:
@@ -30316,16 +28816,11 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688,2147483688,2147483688]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_40_v2i64:
@@ -30345,16 +28840,11 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688,2147483688,2147483688]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_40_v2i64:
@@ -30370,16 +28860,11 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688,2147483688,2147483688]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_40_v2i64:
@@ -30395,16 +28880,11 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483688,2147483688,2147483688,2147483688]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_40_v2i64:
@@ -30498,15 +28978,10 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_40_v2i64:
@@ -30526,15 +29001,10 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_40_v2i64:
@@ -30550,15 +29020,10 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_40_v2i64:
@@ -30574,15 +29039,10 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_40_v2i64:
@@ -30672,16 +29132,11 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689,2147483689,2147483689]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_41_v2i64:
@@ -30701,16 +29156,11 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689,2147483689,2147483689]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_41_v2i64:
@@ -30726,16 +29176,11 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689,2147483689,2147483689]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_41_v2i64:
@@ -30751,16 +29196,11 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483689,2147483689,2147483689,2147483689]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_41_v2i64:
@@ -30854,15 +29294,10 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_41_v2i64:
@@ -30882,15 +29317,10 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_41_v2i64:
@@ -30906,15 +29336,10 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_41_v2i64:
@@ -30930,15 +29355,10 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_41_v2i64:
@@ -31028,16 +29448,11 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690,2147483690,2147483690]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_42_v2i64:
@@ -31057,16 +29472,11 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690,2147483690,2147483690]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_42_v2i64:
@@ -31082,16 +29492,11 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690,2147483690,2147483690]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_42_v2i64:
@@ -31107,16 +29512,11 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483690,2147483690,2147483690,2147483690]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_42_v2i64:
@@ -31210,15 +29610,10 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_42_v2i64:
@@ -31238,15 +29633,10 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_42_v2i64:
@@ -31262,15 +29652,10 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_42_v2i64:
@@ -31286,15 +29671,10 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_42_v2i64:
@@ -31384,16 +29764,11 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691,2147483691,2147483691]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_43_v2i64:
@@ -31413,16 +29788,11 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691,2147483691,2147483691]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_43_v2i64:
@@ -31438,16 +29808,11 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691,2147483691,2147483691]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_43_v2i64:
@@ -31463,16 +29828,11 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483691,2147483691,2147483691,2147483691]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_43_v2i64:
@@ -31566,15 +29926,10 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_43_v2i64:
@@ -31594,15 +29949,10 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_43_v2i64:
@@ -31618,15 +29968,10 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_43_v2i64:
@@ -31642,15 +29987,10 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_43_v2i64:
@@ -31740,16 +30080,11 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692,2147483692,2147483692]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_44_v2i64:
@@ -31769,16 +30104,11 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692,2147483692,2147483692]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_44_v2i64:
@@ -31794,16 +30124,11 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692,2147483692,2147483692]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_44_v2i64:
@@ -31819,16 +30144,11 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483692,2147483692,2147483692,2147483692]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_44_v2i64:
@@ -31921,16 +30241,11 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_44_v2i64:
@@ -31950,15 +30265,10 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_44_v2i64:
@@ -31974,15 +30284,10 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_44_v2i64:
@@ -31998,15 +30303,10 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_44_v2i64:
@@ -32096,16 +30396,11 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693,2147483693,2147483693]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_45_v2i64:
@@ -32125,16 +30420,11 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693,2147483693,2147483693]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_45_v2i64:
@@ -32150,16 +30440,11 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693,2147483693,2147483693]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_45_v2i64:
@@ -32175,16 +30460,11 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483693,2147483693,2147483693,2147483693]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_45_v2i64:
@@ -32278,15 +30558,10 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_45_v2i64:
@@ -32306,15 +30581,10 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_45_v2i64:
@@ -32330,15 +30600,10 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_45_v2i64:
@@ -32354,15 +30619,10 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_45_v2i64:
@@ -32452,16 +30712,11 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694,2147483694,2147483694]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_46_v2i64:
@@ -32481,16 +30736,11 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694,2147483694,2147483694]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_46_v2i64:
@@ -32506,16 +30756,11 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694,2147483694,2147483694]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_46_v2i64:
@@ -32531,16 +30776,11 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483694,2147483694,2147483694,2147483694]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_46_v2i64:
@@ -32634,15 +30874,10 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_46_v2i64:
@@ -32662,15 +30897,10 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_46_v2i64:
@@ -32686,15 +30916,10 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_46_v2i64:
@@ -32710,15 +30935,10 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_46_v2i64:
@@ -32808,16 +31028,11 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695,2147483695,2147483695]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_47_v2i64:
@@ -32837,16 +31052,11 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695,2147483695,2147483695]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_47_v2i64:
@@ -32862,16 +31072,11 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695,2147483695,2147483695]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_47_v2i64:
@@ -32887,16 +31092,11 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483695,2147483695,2147483695,2147483695]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_47_v2i64:
@@ -32990,15 +31190,10 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_47_v2i64:
@@ -33018,15 +31213,10 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_47_v2i64:
@@ -33042,15 +31232,10 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_47_v2i64:
@@ -33066,15 +31251,10 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_47_v2i64:
@@ -33164,16 +31344,11 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696,2147483696,2147483696]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_48_v2i64:
@@ -33193,16 +31368,11 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696,2147483696,2147483696]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_48_v2i64:
@@ -33218,16 +31388,11 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696,2147483696,2147483696]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_48_v2i64:
@@ -33243,16 +31408,11 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483696,2147483696,2147483696,2147483696]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_48_v2i64:
@@ -33346,15 +31506,10 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_48_v2i64:
@@ -33374,15 +31529,10 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_48_v2i64:
@@ -33398,15 +31548,10 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_48_v2i64:
@@ -33422,15 +31567,10 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_48_v2i64:
@@ -33520,16 +31660,11 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697,2147483697,2147483697]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_49_v2i64:
@@ -33549,16 +31684,11 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697,2147483697,2147483697]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_49_v2i64:
@@ -33574,16 +31704,11 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697,2147483697,2147483697]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_49_v2i64:
@@ -33599,16 +31724,11 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483697,2147483697,2147483697,2147483697]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_49_v2i64:
@@ -33702,15 +31822,10 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_49_v2i64:
@@ -33730,15 +31845,10 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_49_v2i64:
@@ -33754,15 +31864,10 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_49_v2i64:
@@ -33778,15 +31883,10 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_49_v2i64:
@@ -33876,16 +31976,11 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698,2147483698,2147483698]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_50_v2i64:
@@ -33905,16 +32000,11 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698,2147483698,2147483698]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_50_v2i64:
@@ -33930,16 +32020,11 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698,2147483698,2147483698]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_50_v2i64:
@@ -33955,16 +32040,11 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483698,2147483698,2147483698,2147483698]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_50_v2i64:
@@ -34058,15 +32138,10 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_50_v2i64:
@@ -34086,15 +32161,10 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_50_v2i64:
@@ -34110,15 +32180,10 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_50_v2i64:
@@ -34134,15 +32199,10 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_50_v2i64:
@@ -34232,16 +32292,11 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699,2147483699,2147483699]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_51_v2i64:
@@ -34261,16 +32316,11 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699,2147483699,2147483699]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_51_v2i64:
@@ -34286,16 +32336,11 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699,2147483699,2147483699]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_51_v2i64:
@@ -34311,16 +32356,11 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483699,2147483699,2147483699,2147483699]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_51_v2i64:
@@ -34414,15 +32454,10 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_51_v2i64:
@@ -34442,15 +32477,10 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_51_v2i64:
@@ -34466,15 +32496,10 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_51_v2i64:
@@ -34490,15 +32515,10 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_51_v2i64:
@@ -34588,16 +32608,11 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700,2147483700,2147483700]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_52_v2i64:
@@ -34617,16 +32632,11 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700,2147483700,2147483700]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_52_v2i64:
@@ -34642,16 +32652,11 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700,2147483700,2147483700]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_52_v2i64:
@@ -34667,16 +32672,11 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483700,2147483700,2147483700,2147483700]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_52_v2i64:
@@ -34770,15 +32770,10 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_52_v2i64:
@@ -34798,15 +32793,10 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_52_v2i64:
@@ -34822,15 +32812,10 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_52_v2i64:
@@ -34846,15 +32831,10 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_52_v2i64:
@@ -34944,16 +32924,11 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701,2147483701,2147483701]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_53_v2i64:
@@ -34973,16 +32948,11 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701,2147483701,2147483701]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_53_v2i64:
@@ -34998,16 +32968,11 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701,2147483701,2147483701]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_53_v2i64:
@@ -35023,16 +32988,11 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483701,2147483701,2147483701,2147483701]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_53_v2i64:
@@ -35126,15 +33086,10 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_53_v2i64:
@@ -35154,15 +33109,10 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_53_v2i64:
@@ -35178,15 +33128,10 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_53_v2i64:
@@ -35202,15 +33147,10 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_53_v2i64:
@@ -35300,16 +33240,11 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702,2147483702,2147483702]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_54_v2i64:
@@ -35329,16 +33264,11 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702,2147483702,2147483702]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_54_v2i64:
@@ -35354,16 +33284,11 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702,2147483702,2147483702]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_54_v2i64:
@@ -35379,16 +33304,11 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483702,2147483702,2147483702,2147483702]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_54_v2i64:
@@ -35482,15 +33402,10 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_54_v2i64:
@@ -35510,15 +33425,10 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_54_v2i64:
@@ -35534,15 +33444,10 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_54_v2i64:
@@ -35558,15 +33463,10 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_54_v2i64:
@@ -35656,16 +33556,11 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703,2147483703,2147483703]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_55_v2i64:
@@ -35685,16 +33580,11 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703,2147483703,2147483703]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_55_v2i64:
@@ -35710,16 +33600,11 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703,2147483703,2147483703]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_55_v2i64:
@@ -35735,16 +33620,11 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483703,2147483703,2147483703,2147483703]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_55_v2i64:
@@ -35838,15 +33718,10 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_55_v2i64:
@@ -35866,15 +33741,10 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_55_v2i64:
@@ -35890,15 +33760,10 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_55_v2i64:
@@ -35914,15 +33779,10 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_55_v2i64:
@@ -36012,16 +33872,11 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704,2147483704,2147483704]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_56_v2i64:
@@ -36041,16 +33896,11 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704,2147483704,2147483704]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_56_v2i64:
@@ -36066,16 +33916,11 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704,2147483704,2147483704]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_56_v2i64:
@@ -36091,16 +33936,11 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483704,2147483704,2147483704,2147483704]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_56_v2i64:
@@ -36194,15 +34034,10 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_56_v2i64:
@@ -36222,15 +34057,10 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_56_v2i64:
@@ -36246,15 +34076,10 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_56_v2i64:
@@ -36270,15 +34095,10 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_56_v2i64:
@@ -36368,16 +34188,11 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705,2147483705,2147483705]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_57_v2i64:
@@ -36397,16 +34212,11 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705,2147483705,2147483705]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_57_v2i64:
@@ -36422,16 +34232,11 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705,2147483705,2147483705]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_57_v2i64:
@@ -36447,16 +34252,11 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483705,2147483705,2147483705,2147483705]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_57_v2i64:
@@ -36550,15 +34350,10 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_57_v2i64:
@@ -36578,15 +34373,10 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_57_v2i64:
@@ -36602,15 +34392,10 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_57_v2i64:
@@ -36626,15 +34411,10 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_57_v2i64:
@@ -36724,16 +34504,11 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706,2147483706,2147483706]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_58_v2i64:
@@ -36753,16 +34528,11 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706,2147483706,2147483706]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_58_v2i64:
@@ -36778,16 +34548,11 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706,2147483706,2147483706]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_58_v2i64:
@@ -36803,16 +34568,11 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483706,2147483706,2147483706,2147483706]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_58_v2i64:
@@ -36906,15 +34666,10 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_58_v2i64:
@@ -36934,15 +34689,10 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_58_v2i64:
@@ -36958,15 +34708,10 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_58_v2i64:
@@ -36982,15 +34727,10 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_58_v2i64:
@@ -37080,16 +34820,11 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707,2147483707,2147483707]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_59_v2i64:
@@ -37109,16 +34844,11 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707,2147483707,2147483707]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_59_v2i64:
@@ -37134,16 +34864,11 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707,2147483707,2147483707]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_59_v2i64:
@@ -37159,16 +34884,11 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483707,2147483707,2147483707,2147483707]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_59_v2i64:
@@ -37262,15 +34982,10 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_59_v2i64:
@@ -37290,15 +35005,10 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_59_v2i64:
@@ -37314,15 +35024,10 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_59_v2i64:
@@ -37338,15 +35043,10 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_59_v2i64:
@@ -37436,16 +35136,11 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708,2147483708,2147483708]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_60_v2i64:
@@ -37465,16 +35160,11 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708,2147483708,2147483708]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_60_v2i64:
@@ -37490,16 +35180,11 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708,2147483708,2147483708]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_60_v2i64:
@@ -37515,16 +35200,11 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483708,2147483708,2147483708,2147483708]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_60_v2i64:
@@ -37618,15 +35298,10 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_60_v2i64:
@@ -37646,15 +35321,10 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_60_v2i64:
@@ -37670,15 +35340,10 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_60_v2i64:
@@ -37694,15 +35359,10 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_60_v2i64:
@@ -37792,16 +35452,11 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709,2147483709,2147483709]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_61_v2i64:
@@ -37821,16 +35476,11 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709,2147483709,2147483709]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_61_v2i64:
@@ -37846,16 +35496,11 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709,2147483709,2147483709]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_61_v2i64:
@@ -37871,16 +35516,11 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483709,2147483709,2147483709,2147483709]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_61_v2i64:
@@ -37974,15 +35614,10 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_61_v2i64:
@@ -38002,15 +35637,10 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_61_v2i64:
@@ -38026,15 +35656,10 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_61_v2i64:
@@ -38050,15 +35675,10 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_61_v2i64:
@@ -38148,16 +35768,11 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710,2147483710,2147483710]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_62_v2i64:
@@ -38177,16 +35792,11 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710,2147483710,2147483710]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_62_v2i64:
@@ -38202,16 +35812,11 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710,2147483710,2147483710]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_62_v2i64:
@@ -38227,16 +35832,11 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483710,2147483710,2147483710,2147483710]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_62_v2i64:
@@ -38330,15 +35930,10 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_62_v2i64:
@@ -38358,15 +35953,10 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-NEXT:    por %xmm3, %xmm0
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_62_v2i64:
@@ -38382,15 +35972,10 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_62_v2i64:
@@ -38406,15 +35991,10 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ugt_62_v2i64:
@@ -38504,16 +36084,11 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711,2147483711,2147483711]
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_63_v2i64:
@@ -38533,16 +36108,11 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711]
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711,2147483711,2147483711]
 ; SSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_63_v2i64:
@@ -38558,16 +36128,11 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    psadbw %xmm0, %xmm3
-; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711]
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711,2147483711,2147483711]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_63_v2i64:
@@ -38583,16 +36148,11 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    paddb %xmm4, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm0, %xmm3
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711]
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483711,2147483711,2147483711,2147483711]
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ult_63_v2i64:

diff  --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 04b3de6af98e8..51b013c63b70c 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -24,36 +24,32 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v2i64_v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v2i64_v2i32:
@@ -62,13 +58,11 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    retq
@@ -124,18 +118,16 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    movq %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
@@ -143,18 +135,16 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    movq %xmm0, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
@@ -164,13 +154,11 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    movq %xmm0, (%rdi)
@@ -230,31 +218,27 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v4i64_v4i32:
@@ -262,54 +246,47 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
 ; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v4i64_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm6, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm5, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
 ; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
 ; SSE41-NEXT:    movapd {{.*#+}} xmm5 = [4294967295,429496729]
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -434,184 +411,165 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
 ; SSE2-LABEL: trunc_usat_v8i64_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa (%rdi), %xmm2
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm5
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm6
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    pxor %xmm0, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm8
-; SSE2-NEXT:    por %xmm1, %xmm8
+; SSE2-NEXT:    pxor %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm7
+; SSE2-NEXT:    pand %xmm9, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm7
+; SSE2-NEXT:    por %xmm1, %xmm7
 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm9, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm6
 ; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm0, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE2-NEXT:    movdqa %xmm4, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm7, %xmm5
-; SSE2-NEXT:    pandn %xmm3, %xmm7
-; SSE2-NEXT:    por %xmm5, %xmm7
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    pandn %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v8i64_v8i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa (%rdi), %xmm2
-; SSSE3-NEXT:    movdqa 16(%rdi), %xmm5
+; SSSE3-NEXT:    movdqa 16(%rdi), %xmm0
 ; SSSE3-NEXT:    movdqa 32(%rdi), %xmm6
 ; SSSE3-NEXT:    movdqa 48(%rdi), %xmm1
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm7
-; SSSE3-NEXT:    pxor %xmm0, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm8
-; SSSE3-NEXT:    pand %xmm8, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm8
-; SSSE3-NEXT:    por %xmm1, %xmm8
+; SSSE3-NEXT:    pxor %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm7
+; SSSE3-NEXT:    pand %xmm9, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm7
+; SSSE3-NEXT:    por %xmm1, %xmm7
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm1
-; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm9, %xmm1
 ; SSSE3-NEXT:    pand %xmm1, %xmm6
 ; SSSE3-NEXT:    pandn %xmm3, %xmm1
 ; SSSE3-NEXT:    por %xmm6, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm0, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSSE3-NEXT:    movdqa %xmm4, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm7
-; SSSE3-NEXT:    pand %xmm7, %xmm5
-; SSSE3-NEXT:    pandn %xmm3, %xmm7
-; SSSE3-NEXT:    por %xmm5, %xmm7
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm0, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm0
 ; SSSE3-NEXT:    pand %xmm0, %xmm2
 ; SSSE3-NEXT:    pandn %xmm3, %xmm0
 ; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v8i64_v8i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm3
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm7
+; SSE41-NEXT:    movdqa (%rdi), %xmm4
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm7
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm8
 ; SSE41-NEXT:    movdqa 48(%rdi), %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm8
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm5, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
-; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm8, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm7
-; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm8
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT:    pxor %xmm4, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2]
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v8i64_v8i32:
@@ -722,18 +680,16 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
@@ -741,18 +697,16 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSSE3-NEXT:    retq
 ;
@@ -762,13 +716,11 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -845,18 +797,16 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    movd %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
@@ -865,18 +815,16 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSSE3-NEXT:    movd %xmm0, (%rdi)
 ; SSSE3-NEXT:    retq
@@ -887,13 +835,11 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -979,33 +925,29 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
@@ -1016,65 +958,60 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm0, %xmm5
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pandn %xmm2, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    pandn %xmm2, %xmm4
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v4i64_v4i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm5, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    movdqa %xmm5, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm6
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
-; SSE41-NEXT:    pxor %xmm3, %xmm4
-; SSE41-NEXT:    movdqa %xmm5, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm6, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT:    pxor %xmm2, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    packusdw %xmm6, %xmm2
-; SSE41-NEXT:    packusdw %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pand %xmm6, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm7, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v4i64_v4i16:
@@ -1152,33 +1089,29 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movq %xmm1, (%rdi)
@@ -1190,33 +1123,29 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm0, %xmm5
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pandn %xmm2, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    pandn %xmm2, %xmm4
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    movq %xmm1, (%rdi)
@@ -1225,31 +1154,30 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-LABEL: trunc_usat_v4i64_v4i16_store:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [65535,65535]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [65535,65535]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm5, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    movdqa %xmm5, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm6
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
-; SSE41-NEXT:    pxor %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm5, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm6, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT:    pxor %xmm2, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    packusdw %xmm6, %xmm3
-; SSE41-NEXT:    packusdw %xmm3, %xmm3
-; SSE41-NEXT:    movq %xmm3, (%rdi)
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    packusdw %xmm7, %xmm4
+; SSE41-NEXT:    packusdw %xmm4, %xmm4
+; SSE41-NEXT:    movq %xmm4, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v4i64_v4i16_store:
@@ -1328,204 +1256,185 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
 ; SSE2-LABEL: trunc_usat_v8i64_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm4
+; SSE2-NEXT:    movdqa (%rdi), %xmm5
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm6
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT:    movdqa %xmm6, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm6
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm6, %xmm1
 ; SSE2-NEXT:    movdqa %xmm7, %xmm6
 ; SSE2-NEXT:    pxor %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm9, %xmm6
 ; SSE2-NEXT:    pand %xmm6, %xmm7
-; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
 ; SSE2-NEXT:    por %xmm7, %xmm6
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
 ; SSE2-NEXT:    pxor %xmm3, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm8
-; SSE2-NEXT:    por %xmm4, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pand %xmm9, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm7
+; SSE2-NEXT:    por %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa (%rdi), %xmm4
+; SSSE3-NEXT:    movdqa (%rdi), %xmm5
 ; SSSE3-NEXT:    movdqa 16(%rdi), %xmm0
 ; SSSE3-NEXT:    movdqa 32(%rdi), %xmm6
 ; SSSE3-NEXT:    movdqa 48(%rdi), %xmm7
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT:    movdqa %xmm6, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm6
-; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm9, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm6
+; SSSE3-NEXT:    pandn %xmm2, %xmm1
+; SSSE3-NEXT:    por %xmm6, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
 ; SSSE3-NEXT:    pxor %xmm3, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm9, %xmm6
 ; SSSE3-NEXT:    pand %xmm6, %xmm7
-; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
 ; SSSE3-NEXT:    por %xmm7, %xmm6
-; SSSE3-NEXT:    movdqa %xmm4, %xmm7
+; SSSE3-NEXT:    movdqa %xmm5, %xmm7
 ; SSSE3-NEXT:    pxor %xmm3, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm8
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pandn %xmm1, %xmm8
-; SSSE3-NEXT:    por %xmm4, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pandn %xmm1, %xmm4
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    pand %xmm9, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm5
+; SSSE3-NEXT:    pandn %xmm2, %xmm7
+; SSSE3-NEXT:    por %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v8i64_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm7
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm3
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm6
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa (%rdi), %xmm8
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm2
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm4
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm5, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    packusdw %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm8
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm8
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT:    packusdw %xmm8, %xmm1
-; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm8, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm7
-; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    packusdw %xmm7, %xmm2
-; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm3, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT:    pxor %xmm4, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    packusdw %xmm8, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v8i64_v8i16:
@@ -2035,17 +1944,15 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -2056,17 +1963,15 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
@@ -2076,13 +1981,11 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -2137,22 +2040,20 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    movw %ax, (%rdi)
 ; SSE2-NEXT:    retq
 ;
@@ -2160,19 +2061,17 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    movd %xmm1, %eax
 ; SSSE3-NEXT:    movw %ax, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
@@ -2182,13 +2081,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    pextrw $0, %xmm2, (%rdi)
@@ -2249,27 +2146,23 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    pandn %xmm2, %xmm4
 ; SSE2-NEXT:    por %xmm1, %xmm4
@@ -2286,27 +2179,23 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm6, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
 ; SSSE3-NEXT:    pandn %xmm2, %xmm4
 ; SSSE3-NEXT:    por %xmm1, %xmm4
@@ -2319,32 +2208,31 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-LABEL: trunc_usat_v4i64_v4i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm6, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    movapd %xmm4, %xmm3
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm5
-; SSE41-NEXT:    movdqa %xmm6, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE41-NEXT:    movapd {{.*#+}} xmm5 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm7, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm1, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm0, %xmm5
 ; SSE41-NEXT:    pshufb %xmm0, %xmm4
-; SSE41-NEXT:    pshufb %xmm0, %xmm3
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v4i64_v4i8:
@@ -2426,33 +2314,29 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm7, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    pandn %xmm2, %xmm3
 ; SSE2-NEXT:    por %xmm0, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    packuswb %xmm4, %xmm3
+; SSE2-NEXT:    packuswb %xmm0, %xmm3
 ; SSE2-NEXT:    packuswb %xmm3, %xmm3
 ; SSE2-NEXT:    packuswb %xmm3, %xmm3
 ; SSE2-NEXT:    movd %xmm3, (%rdi)
@@ -2460,70 +2344,65 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ;
 ; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm8, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm0, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm2, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm0, %xmm4
-; SSSE3-NEXT:    pshufb %xmm0, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT:    movd %xmm3, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movd %xmm2, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v4i64_v4i8_store:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm5, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    movdqa %xmm5, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm6
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT:    pxor %xmm1, %xmm4
-; SSE41-NEXT:    movdqa %xmm5, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm6, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm7
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm0, %xmm3
-; SSE41-NEXT:    pshufb %xmm0, %xmm6
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSE41-NEXT:    movd %xmm6, (%rdi)
+; SSE41-NEXT:    pshufb %xmm0, %xmm4
+; SSE41-NEXT:    pshufb %xmm0, %xmm7
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE41-NEXT:    movd %xmm7, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v4i64_v4i8_store:
@@ -2614,59 +2493,51 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
 ; SSE2-NEXT:    pxor %xmm3, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm0, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT:    pand %xmm9, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm7
+; SSE2-NEXT:    por %xmm0, %xmm7
 ; SSE2-NEXT:    movdqa %xmm6, %xmm0
 ; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm9, %xmm0
 ; SSE2-NEXT:    pand %xmm0, %xmm6
 ; SSE2-NEXT:    pandn %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    packuswb %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
 ; SSE2-NEXT:    pxor %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm7, %xmm5
-; SSE2-NEXT:    pandn %xmm2, %xmm7
-; SSE2-NEXT:    por %xmm5, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    pxor %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    packuswb %xmm7, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    packuswb %xmm6, %xmm5
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -2680,117 +2551,106 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm7
 ; SSSE3-NEXT:    pxor %xmm3, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm8
-; SSSE3-NEXT:    pand %xmm8, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm0, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT:    pand %xmm9, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm7
+; SSSE3-NEXT:    por %xmm0, %xmm7
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm0
 ; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm9, %xmm0
 ; SSSE3-NEXT:    pand %xmm0, %xmm6
 ; SSSE3-NEXT:    pandn %xmm2, %xmm0
 ; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    packuswb %xmm8, %xmm0
+; SSSE3-NEXT:    packuswb %xmm7, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
 ; SSSE3-NEXT:    pxor %xmm3, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm7
-; SSSE3-NEXT:    pand %xmm7, %xmm5
-; SSSE3-NEXT:    pandn %xmm2, %xmm7
-; SSSE3-NEXT:    por %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
 ; SSSE3-NEXT:    pxor %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm2, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    packuswb %xmm7, %xmm4
-; SSSE3-NEXT:    packuswb %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    packuswb %xmm6, %xmm5
+; SSSE3-NEXT:    packuswb %xmm5, %xmm0
 ; SSSE3-NEXT:    packuswb %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v8i64_v8i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm7
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm3
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm6
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa (%rdi), %xmm8
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm2
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm4
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm5, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    packusdw %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm8
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm8
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT:    packusdw %xmm8, %xmm1
-; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm4, %xmm8
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm8, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm7
-; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    packusdw %xmm7, %xmm2
-; SSE41-NEXT:    packusdw %xmm2, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm1
+; SSE41-NEXT:    movapd %xmm3, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT:    pxor %xmm4, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    packusdw %xmm8, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v8i64_v8i8:
@@ -2872,59 +2732,51 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm7
 ; SSE2-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm8
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm8
-; SSE2-NEXT:    por %xmm5, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSE2-NEXT:    pand %xmm9, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm7
+; SSE2-NEXT:    por %xmm5, %xmm7
 ; SSE2-NEXT:    movdqa %xmm6, %xmm5
 ; SSE2-NEXT:    pxor %xmm2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm9, %xmm5
 ; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    pandn %xmm1, %xmm5
 ; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    packuswb %xmm8, %xmm5
+; SSE2-NEXT:    packuswb %xmm7, %xmm5
 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
 ; SSE2-NEXT:    pxor %xmm2, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pand %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm7
-; SSE2-NEXT:    por %xmm4, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    packuswb %xmm7, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
+; SSE2-NEXT:    packuswb %xmm4, %xmm5
 ; SSE2-NEXT:    packuswb %xmm5, %xmm5
 ; SSE2-NEXT:    movq %xmm5, (%rsi)
 ; SSE2-NEXT:    retq
@@ -2939,118 +2791,107 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm7
 ; SSSE3-NEXT:    pxor %xmm2, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm8
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm8
-; SSSE3-NEXT:    pand %xmm8, %xmm5
-; SSSE3-NEXT:    pandn %xmm1, %xmm8
-; SSSE3-NEXT:    por %xmm5, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSSE3-NEXT:    pand %xmm9, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm5
+; SSSE3-NEXT:    pandn %xmm1, %xmm7
+; SSSE3-NEXT:    por %xmm5, %xmm7
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm5
 ; SSSE3-NEXT:    pxor %xmm2, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT:    pand %xmm9, %xmm5
 ; SSSE3-NEXT:    pand %xmm5, %xmm6
 ; SSSE3-NEXT:    pandn %xmm1, %xmm5
 ; SSSE3-NEXT:    por %xmm6, %xmm5
-; SSSE3-NEXT:    packuswb %xmm8, %xmm5
+; SSSE3-NEXT:    packuswb %xmm7, %xmm5
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm6
 ; SSSE3-NEXT:    pxor %xmm2, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm7
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pandn %xmm1, %xmm7
-; SSSE3-NEXT:    por %xmm4, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    por %xmm4, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    pxor %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm1, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    packuswb %xmm7, %xmm3
-; SSSE3-NEXT:    packuswb %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm4
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    packuswb %xmm6, %xmm4
+; SSSE3-NEXT:    packuswb %xmm4, %xmm5
 ; SSSE3-NEXT:    packuswb %xmm5, %xmm5
 ; SSSE3-NEXT:    movq %xmm5, (%rsi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm7
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm5
-; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa (%rdi), %xmm8
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm7
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm3
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm6
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm4, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm9
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    packusdw %xmm9, %xmm7
 ; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm3, %xmm8
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm8
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm3, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm1, %xmm8
+; SSE41-NEXT:    movapd %xmm2, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm8
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    movdqa %xmm3, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm1, %xmm6
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
-; SSE41-NEXT:    packusdw %xmm8, %xmm6
-; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm3, %xmm8
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm8, %xmm0
-; SSE41-NEXT:    movapd %xmm1, %xmm7
-; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT:    pxor %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    packusdw %xmm7, %xmm1
-; SSE41-NEXT:    packusdw %xmm1, %xmm6
-; SSE41-NEXT:    packuswb %xmm6, %xmm6
-; SSE41-NEXT:    movq %xmm6, (%rsi)
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    packusdw %xmm8, %xmm2
+; SSE41-NEXT:    packusdw %xmm2, %xmm7
+; SSE41-NEXT:    packuswb %xmm7, %xmm7
+; SSE41-NEXT:    movq %xmm7, (%rsi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store:
@@ -3128,7 +2969,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
 ; SSE2-LABEL: trunc_usat_v16i64_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa 96(%rdi), %xmm1
-; SSE2-NEXT:    movdqa 112(%rdi), %xmm3
+; SSE2-NEXT:    movdqa 112(%rdi), %xmm4
 ; SSE2-NEXT:    movdqa 64(%rdi), %xmm6
 ; SSE2-NEXT:    movdqa 80(%rdi), %xmm7
 ; SSE2-NEXT:    movdqa (%rdi), %xmm10
@@ -3136,116 +2977,100 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm8
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm11
-; SSE2-NEXT:    pxor %xmm4, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm12
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    pxor %xmm3, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm11, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2]
-; SSE2-NEXT:    pand %xmm12, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSE2-NEXT:    por %xmm11, %xmm12
-; SSE2-NEXT:    pand %xmm12, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm12
-; SSE2-NEXT:    por %xmm0, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm11
+; SSE2-NEXT:    pand %xmm13, %xmm11
+; SSE2-NEXT:    pand %xmm11, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm11
+; SSE2-NEXT:    por %xmm0, %xmm11
 ; SSE2-NEXT:    movdqa %xmm10, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm11
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm14
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
-; SSE2-NEXT:    por %xmm14, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm13, %xmm0
 ; SSE2-NEXT:    pand %xmm0, %xmm10
 ; SSE2-NEXT:    pandn %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm10, %xmm0
-; SSE2-NEXT:    packuswb %xmm12, %xmm0
+; SSE2-NEXT:    packuswb %xmm11, %xmm0
 ; SSE2-NEXT:    movdqa %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm4, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm11
-; SSE2-NEXT:    movdqa %xmm5, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
-; SSE2-NEXT:    pand %xmm11, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm10, %xmm11
-; SSE2-NEXT:    pand %xmm11, %xmm9
-; SSE2-NEXT:    pandn %xmm2, %xmm11
-; SSE2-NEXT:    por %xmm9, %xmm11
-; SSE2-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NEXT:    pxor %xmm4, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm10
+; SSE2-NEXT:    pxor %xmm3, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm10
+; SSE2-NEXT:    pand %xmm12, %xmm10
 ; SSE2-NEXT:    pand %xmm10, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm10
-; SSE2-NEXT:    pand %xmm10, %xmm8
 ; SSE2-NEXT:    pandn %xmm2, %xmm10
-; SSE2-NEXT:    por %xmm8, %xmm10
-; SSE2-NEXT:    packuswb %xmm11, %xmm10
-; SSE2-NEXT:    packuswb %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    pxor %xmm4, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSE2-NEXT:    por %xmm9, %xmm10
+; SSE2-NEXT:    movdqa %xmm8, %xmm9
+; SSE2-NEXT:    pxor %xmm3, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm9
+; SSE2-NEXT:    pand %xmm12, %xmm9
 ; SSE2-NEXT:    pand %xmm9, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm8, %xmm9
-; SSE2-NEXT:    pand %xmm9, %xmm7
 ; SSE2-NEXT:    pandn %xmm2, %xmm9
-; SSE2-NEXT:    por %xmm7, %xmm9
+; SSE2-NEXT:    por %xmm8, %xmm9
+; SSE2-NEXT:    packuswb %xmm10, %xmm9
+; SSE2-NEXT:    packuswb %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm8
+; SSE2-NEXT:    pand %xmm10, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pandn %xmm2, %xmm8
+; SSE2-NEXT:    por %xmm7, %xmm8
 ; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm11, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT:    pand %xmm10, %xmm7
 ; SSE2-NEXT:    pand %xmm7, %xmm6
 ; SSE2-NEXT:    pandn %xmm2, %xmm7
 ; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    packuswb %xmm9, %xmm7
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT:    packuswb %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm8
-; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm5, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    pandn %xmm2, %xmm4
 ; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    packuswb %xmm8, %xmm4
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
 ; SSE2-NEXT:    packuswb %xmm4, %xmm7
 ; SSE2-NEXT:    packuswb %xmm7, %xmm0
 ; SSE2-NEXT:    retq
@@ -3253,7 +3078,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
 ; SSSE3-LABEL: trunc_usat_v16i64_v16i8:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa 96(%rdi), %xmm1
-; SSSE3-NEXT:    movdqa 112(%rdi), %xmm3
+; SSSE3-NEXT:    movdqa 112(%rdi), %xmm4
 ; SSSE3-NEXT:    movdqa 64(%rdi), %xmm6
 ; SSSE3-NEXT:    movdqa 80(%rdi), %xmm7
 ; SSSE3-NEXT:    movdqa (%rdi), %xmm10
@@ -3261,226 +3086,203 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
 ; SSSE3-NEXT:    movdqa 32(%rdi), %xmm8
 ; SSSE3-NEXT:    movdqa 48(%rdi), %xmm9
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm11
-; SSSE3-NEXT:    pxor %xmm4, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm12
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    pxor %xmm3, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm12, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSSE3-NEXT:    por %xmm11, %xmm12
-; SSSE3-NEXT:    pand %xmm12, %xmm0
-; SSSE3-NEXT:    pandn %xmm2, %xmm12
-; SSSE3-NEXT:    por %xmm0, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm13
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm11
+; SSSE3-NEXT:    pand %xmm13, %xmm11
+; SSSE3-NEXT:    pand %xmm11, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm11
+; SSSE3-NEXT:    por %xmm0, %xmm11
 ; SSSE3-NEXT:    movdqa %xmm10, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm11
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm11, %xmm14
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
-; SSSE3-NEXT:    por %xmm14, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm13
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm13, %xmm0
 ; SSSE3-NEXT:    pand %xmm0, %xmm10
 ; SSSE3-NEXT:    pandn %xmm2, %xmm0
 ; SSSE3-NEXT:    por %xmm10, %xmm0
-; SSSE3-NEXT:    packuswb %xmm12, %xmm0
+; SSSE3-NEXT:    packuswb %xmm11, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm9, %xmm10
-; SSSE3-NEXT:    pxor %xmm4, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm11
-; SSSE3-NEXT:    movdqa %xmm5, %xmm12
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm12
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm11, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
-; SSSE3-NEXT:    por %xmm10, %xmm11
-; SSSE3-NEXT:    pand %xmm11, %xmm9
-; SSSE3-NEXT:    pandn %xmm2, %xmm11
-; SSSE3-NEXT:    por %xmm9, %xmm11
-; SSSE3-NEXT:    movdqa %xmm8, %xmm9
-; SSSE3-NEXT:    pxor %xmm4, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm10
+; SSSE3-NEXT:    pxor %xmm3, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm12
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm12
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm10
+; SSSE3-NEXT:    pand %xmm12, %xmm10
 ; SSSE3-NEXT:    pand %xmm10, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
-; SSSE3-NEXT:    por %xmm9, %xmm10
-; SSSE3-NEXT:    pand %xmm10, %xmm8
 ; SSSE3-NEXT:    pandn %xmm2, %xmm10
-; SSSE3-NEXT:    por %xmm8, %xmm10
-; SSSE3-NEXT:    packuswb %xmm11, %xmm10
-; SSSE3-NEXT:    packuswb %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm7, %xmm8
-; SSSE3-NEXT:    pxor %xmm4, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSSE3-NEXT:    movdqa %xmm5, %xmm10
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSSE3-NEXT:    por %xmm9, %xmm10
+; SSSE3-NEXT:    movdqa %xmm8, %xmm9
+; SSSE3-NEXT:    pxor %xmm3, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm9
+; SSSE3-NEXT:    pand %xmm12, %xmm9
 ; SSSE3-NEXT:    pand %xmm9, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    por %xmm8, %xmm9
-; SSSE3-NEXT:    pand %xmm9, %xmm7
 ; SSSE3-NEXT:    pandn %xmm2, %xmm9
-; SSSE3-NEXT:    por %xmm7, %xmm9
+; SSSE3-NEXT:    por %xmm8, %xmm9
+; SSSE3-NEXT:    packuswb %xmm10, %xmm9
+; SSSE3-NEXT:    packuswb %xmm9, %xmm0
+; SSSE3-NEXT:    movdqa %xmm7, %xmm8
+; SSSE3-NEXT:    pxor %xmm3, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm10
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm8
+; SSSE3-NEXT:    pand %xmm10, %xmm8
+; SSSE3-NEXT:    pand %xmm8, %xmm7
+; SSSE3-NEXT:    pandn %xmm2, %xmm8
+; SSSE3-NEXT:    por %xmm7, %xmm8
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSSE3-NEXT:    pxor %xmm3, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm10
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    por %xmm11, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT:    pand %xmm10, %xmm7
 ; SSSE3-NEXT:    pand %xmm7, %xmm6
 ; SSSE3-NEXT:    pandn %xmm2, %xmm7
 ; SSSE3-NEXT:    por %xmm6, %xmm7
-; SSSE3-NEXT:    packuswb %xmm9, %xmm7
-; SSSE3-NEXT:    movdqa %xmm3, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSSE3-NEXT:    packuswb %xmm8, %xmm7
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm8
-; SSSE3-NEXT:    pand %xmm8, %xmm3
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm9, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm4, %xmm6
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm5, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
 ; SSSE3-NEXT:    pandn %xmm2, %xmm4
 ; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    packuswb %xmm8, %xmm4
+; SSSE3-NEXT:    packuswb %xmm6, %xmm4
 ; SSSE3-NEXT:    packuswb %xmm4, %xmm7
 ; SSSE3-NEXT:    packuswb %xmm7, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_usat_v16i64_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa 96(%rdi), %xmm2
-; SSE41-NEXT:    movdqa 112(%rdi), %xmm4
-; SSE41-NEXT:    movdqa 64(%rdi), %xmm7
-; SSE41-NEXT:    movdqa 80(%rdi), %xmm8
-; SSE41-NEXT:    movdqa (%rdi), %xmm11
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm9
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm10
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa 96(%rdi), %xmm3
+; SSE41-NEXT:    movdqa 112(%rdi), %xmm5
+; SSE41-NEXT:    movdqa 64(%rdi), %xmm8
+; SSE41-NEXT:    movdqa 80(%rdi), %xmm9
+; SSE41-NEXT:    movdqa (%rdi), %xmm12
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm2
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm10
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm11
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm6, %xmm13
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm13
+; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT:    movdqa %xmm5, %xmm12
+; SSE41-NEXT:    pcmpgtd %xmm14, %xmm0
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm13
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm13
+; SSE41-NEXT:    movdqa %xmm12, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm14, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm2
+; SSE41-NEXT:    packusdw %xmm13, %xmm2
+; SSE41-NEXT:    movdqa %xmm11, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm12
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm12
-; SSE41-NEXT:    movdqa %xmm5, %xmm13
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm13, %xmm0
 ; SSE41-NEXT:    pand %xmm12, %xmm0
-; SSE41-NEXT:    por %xmm13, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm12
-; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm12
-; SSE41-NEXT:    movdqa %xmm11, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm5, %xmm13
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm13, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm1
-; SSE41-NEXT:    packusdw %xmm12, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm12
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm12
 ; SSE41-NEXT:    movdqa %xmm10, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm11
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm11
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm11
-; SSE41-NEXT:    movdqa %xmm5, %xmm12
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm13, %xmm0
 ; SSE41-NEXT:    pand %xmm11, %xmm0
-; SSE41-NEXT:    por %xmm12, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm11
+; SSE41-NEXT:    movapd %xmm4, %xmm11
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm11
+; SSE41-NEXT:    packusdw %xmm12, %xmm11
+; SSE41-NEXT:    packusdw %xmm11, %xmm2
 ; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm10
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm10
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT:    movdqa %xmm5, %xmm12
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT:    pand %xmm10, %xmm0
-; SSE41-NEXT:    por %xmm12, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm10
+; SSE41-NEXT:    movapd %xmm4, %xmm10
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm10
-; SSE41-NEXT:    packusdw %xmm11, %xmm10
-; SSE41-NEXT:    packusdw %xmm10, %xmm1
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm9
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm9
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT:    movdqa %xmm5, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT:    pand %xmm9, %xmm0
-; SSE41-NEXT:    por %xmm10, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm9
+; SSE41-NEXT:    movapd %xmm4, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm9
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm8
+; SSE41-NEXT:    packusdw %xmm10, %xmm9
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm8
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT:    movdqa %xmm5, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm0
-; SSE41-NEXT:    por %xmm10, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm8
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm8
-; SSE41-NEXT:    packusdw %xmm9, %xmm8
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm5, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm5, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    por %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm7
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm7
-; SSE41-NEXT:    pxor %xmm2, %xmm6
-; SSE41-NEXT:    movdqa %xmm5, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT:    packusdw %xmm7, %xmm3
-; SSE41-NEXT:    packusdw %xmm3, %xmm8
-; SSE41-NEXT:    packuswb %xmm8, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm8
+; SSE41-NEXT:    pxor %xmm3, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm6, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    packusdw %xmm8, %xmm4
+; SSE41-NEXT:    packusdw %xmm4, %xmm9
+; SSE41-NEXT:    packuswb %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_usat_v16i64_v16i8:


        


More information about the llvm-commits mailing list