[llvm] r373187 - [TargetLowering] Simplify expansion of S{ADD,SUB}O

Roger Ferrer Ibanez via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 30 00:58:51 PDT 2019


Author: rogfer01
Date: Mon Sep 30 00:58:50 2019
New Revision: 373187

URL: http://llvm.org/viewvc/llvm-project?rev=373187&view=rev
Log:
[TargetLowering] Simplify expansion of S{ADD,SUB}O

ISD::SADDO uses the suggested sequence described in section §2.4 of
the RISC-V Spec v2.2. ISD::SSUBO uses the dual approach, but checks whether
the other operand (RHS) is (non-zero) positive.

Differential Revision: https://reviews.llvm.org/D47927
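
For readers less familiar with the trick, here is a minimal standalone sketch
(illustrative only, not part of this patch; the function names and the fixed
32-bit width are hypothetical) of the scalar checks the new expansion computes:

  #include <cassert>
  #include <cstdint>

  // SADDO: overflow <=> (RHS < 0) xor (LHS + RHS < LHS)
  static bool saddOverflows(int32_t LHS, int32_t RHS) {
    // Wrap-around addition, done in unsigned to keep the example well defined.
    int32_t Sum = (int32_t)((uint32_t)LHS + (uint32_t)RHS);
    return (RHS < 0) != (Sum < LHS);
  }

  // SSUBO: overflow <=> (RHS > 0) xor (LHS - RHS < LHS)
  static bool ssubOverflows(int32_t LHS, int32_t RHS) {
    int32_t Diff = (int32_t)((uint32_t)LHS - (uint32_t)RHS);
    return (RHS > 0) != (Diff < LHS);
  }

  int main() {
    assert(saddOverflows(INT32_MAX, 1));    // 0x7fffffff + 1 wraps negative
    assert(!saddOverflows(INT32_MAX, -1));
    assert(ssubOverflows(INT32_MIN, 1));    // 0x80000000 - 1 wraps positive
    assert(!ssubOverflows(INT32_MIN, -1));
    return 0;
  }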

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll
    llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll
    llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll
    llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
    llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll
    llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll
    llvm/trunk/test/CodeGen/X86/combine-mulo.ll
    llvm/trunk/test/CodeGen/X86/mulo-pow2.ll
    llvm/trunk/test/CodeGen/X86/sadd_sat.ll
    llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
    llvm/trunk/test/CodeGen/X86/ssub_sat.ll
    llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
    llvm/trunk/test/CodeGen/X86/vec_saddo.ll
    llvm/trunk/test/CodeGen/X86/vec_ssubo.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Mon Sep 30 00:58:50 2019
@@ -6907,24 +6907,19 @@ void TargetLowering::expandSADDSUBO(
 
   SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
 
-  //   LHSSign -> LHS >= 0
-  //   RHSSign -> RHS >= 0
-  //   SumSign -> Result >= 0
-  //
-  //   Add:
-  //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
-  //   Sub:
-  //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
-  SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
-  SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
-  SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
-                                    IsAdd ? ISD::SETEQ : ISD::SETNE);
+  // For an addition, the result should be less than one of the operands (LHS)
+  // if and only if the other operand (RHS) is negative, otherwise there will
+  // be overflow.
+  // For a subtraction, the result should be less than one of the operands
+  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
+  // otherwise there will be overflow.
+  SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+  SDValue ConditionRHS =
+      DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
 
-  SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE);
-  SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
-
-  SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
-  Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType);
+  Overflow = DAG.getBoolExtOrTrunc(
+      DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+      ResultType, ResultType);
 }
 
 bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,

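As a quick sanity check of the comment above (a sketch, not part of the commit),
the old and the new overflow predicates can be compared exhaustively over a
narrow type; the following asserts that they agree for every pair of i8 values:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int L = -128; L <= 127; ++L) {
      for (int R = -128; R <= 127; ++R) {
        int8_t Sum = (int8_t)(L + R);   // wrap-around add
        int8_t Diff = (int8_t)(L - R);  // wrap-around sub

        // Old expansion: signs match (add) / differ (sub), and the result's
        // sign differs from the sign of LHS.
        bool OldAdd = ((L >= 0) == (R >= 0)) && ((L >= 0) != (Sum >= 0));
        bool OldSub = ((L >= 0) != (R >= 0)) && ((L >= 0) != (Diff >= 0));

        // New expansion: (result < LHS) xor (RHS negative) for addition,
        // (result < LHS) xor (RHS positive) for subtraction.
        bool NewAdd = (R < 0) != (Sum < L);
        bool NewSub = (R > 0) != (Diff < L);

        assert(OldAdd == NewAdd && OldSub == NewSub);
      }
    }
    return 0;
  }
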
Modified: llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll Mon Sep 30 00:58:50 2019
@@ -54,17 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
 ; CHECK-LABEL: vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v2.4s, #0
 ; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v5.4s
 ; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);

Modified: llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -36,17 +36,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <1
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v2.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v3.16b, #127
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -57,29 +53,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <3
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v4.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmlt v16.16b, v4.16b, #0
+; CHECK-NEXT:    cmlt v7.16b, v4.16b, #0
 ; CHECK-NEXT:    movi v6.16b, #127
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    add v7.16b, v1.16b, v3.16b
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    cmlt v16.16b, v7.16b, #0
 ; CHECK-NEXT:    movi v5.16b, #127
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v16.16b, v4.16b, #0
-; CHECK-NEXT:    cmge v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmeq v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    cmge v16.16b, v7.16b, #0
-; CHECK-NEXT:    cmeq v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    cmeq v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmlt v3.16b, v3.16b, #0
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -102,42 +90,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <6
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.16b, v21.16b, #0
+; CHECK-NEXT:    cmlt v4.16b, v4.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v16.16b
 ; CHECK-NEXT:    movi v22.16b, #127
 ; CHECK-NEXT:    add v23.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmlt v4.16b, v5.16b, #0
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, v19.16b
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.16b, v23.16b, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmlt v4.16b, v6.16b, #0
+; CHECK-NEXT:    cmgt v2.16b, v2.16b, v21.16b
 ; CHECK-NEXT:    movi v17.16b, #127
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmlt v4.16b, v7.16b, #0
+; CHECK-NEXT:    cmgt v3.16b, v3.16b, v23.16b
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.16b, v4.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v24.16b, v16.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v5.16b, #0
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmeq v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v19.16b, #0
-; CHECK-NEXT:    cmge v6.16b, v6.16b, #0
-; CHECK-NEXT:    cmge v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    cmeq v1.16b, v1.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v21.16b, #0
-; CHECK-NEXT:    cmge v7.16b, v7.16b, #0
-; CHECK-NEXT:    cmge v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v6.16b, v2.16b, v6.16b
-; CHECK-NEXT:    cmeq v2.16b, v2.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v23.16b, #0
-; CHECK-NEXT:    cmeq v7.16b, v3.16b, v7.16b
-; CHECK-NEXT:    cmeq v3.16b, v3.16b, v24.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -151,17 +123,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v5.8h, v2.8h, #0
 ; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    cmeq v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v5.8h
 ; CHECK-NEXT:    mvni v3.8h, #128, lsl #8
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -172,29 +140,21 @@ define <16 x i16> @v16i16(<16 x i16> %x,
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v4.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmlt v16.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
 ; CHECK-NEXT:    mvni v6.8h, #128, lsl #8
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    add v7.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v4.8h
 ; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
 ; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v16.8h, v4.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmeq v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    cmge v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmeq v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v7.8h
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -217,42 +177,26 @@ define <32 x i16> @v32i16(<32 x i16> %x,
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.8h, v21.8h, #0
+; CHECK-NEXT:    cmlt v4.8h, v4.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v16.8h
 ; CHECK-NEXT:    mvni v22.8h, #128, lsl #8
 ; CHECK-NEXT:    add v23.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmlt v4.8h, v5.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v19.8h
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.8h, v23.8h, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmlt v4.8h, v6.8h, #0
+; CHECK-NEXT:    cmgt v2.8h, v2.8h, v21.8h
 ; CHECK-NEXT:    mvni v17.8h, #128, lsl #8
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmlt v4.8h, v7.8h, #0
+; CHECK-NEXT:    cmgt v3.8h, v3.8h, v23.8h
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.8h, v4.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v24.8h, v16.8h, #0
-; CHECK-NEXT:    cmge v5.8h, v5.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmeq v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v19.8h, #0
-; CHECK-NEXT:    cmge v6.8h, v6.8h, #0
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmeq v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v21.8h, #0
-; CHECK-NEXT:    cmge v7.8h, v7.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmeq v6.8h, v2.8h, v6.8h
-; CHECK-NEXT:    cmeq v2.8h, v2.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v23.8h, #0
-; CHECK-NEXT:    cmeq v7.8h, v3.8h, v7.8h
-; CHECK-NEXT:    cmeq v3.8h, v3.8h, v24.8h
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -269,16 +213,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    movi v2.8b, #127
 ; CHECK-NEXT:    add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmge v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmge v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmge v5.8b, v3.8b, #0
 ; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT:    cmlt v1.8b, v1.8b, #0
+; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
@@ -311,17 +251,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8
 ; CHECK-NEXT:    shl v1.4h, v1.4h, #8
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
+; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -348,17 +284,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #24
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    add v3.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmge v2.2s, v2.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v3.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT:    cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    cmlt v2.2s, v2.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v3.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
@@ -380,16 +312,12 @@ define void @v4i16(<4 x i16>* %px, <4 x
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-NEXT:    add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
@@ -414,17 +342,13 @@ define void @v2i16(<2 x i16>* %px, <2 x
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #16
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-NEXT:    add v3.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmge v2.2s, v2.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v3.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT:    cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    cmlt v2.2s, v2.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v3.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
@@ -443,17 +367,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <1
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v2.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v3.16b, #127
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@@ -468,27 +388,19 @@ define void @v12i16(<12 x i16>* %px, <12
 ; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
 ; CHECK-NEXT:    mvni v4.8h, #128, lsl #8
 ; CHECK-NEXT:    add v6.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmlt v16.8h, v6.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v6.8h, #0
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v5.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    add v7.8h, v0.8h, v3.8h
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v6.8h
 ; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v4.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmge v16.8h, v6.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmeq v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT:    cmge v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmeq v3.8h, v0.8h, v3.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v7.8h
+; CHECK-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    bsl v4.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v6.16b
 ; CHECK-NEXT:    bsl v0.16b, v4.16b, v7.16b
 ; CHECK-NEXT:    str q0, [x2]
@@ -508,16 +420,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8
 ; CHECK-NEXT:    ldr b1, [x1]
 ; CHECK-NEXT:    movi v2.8b, #127
 ; CHECK-NEXT:    add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmge v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmge v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmge v5.8b, v3.8b, #0
 ; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT:    cmlt v1.8b, v1.8b, #0
+; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
@@ -535,16 +443,12 @@ define void @v1i16(<1 x i16>* %px, <1 x
 ; CHECK-NEXT:    ldr h1, [x1]
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-NEXT:    add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
@@ -561,17 +465,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <1
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #4
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #4
 ; CHECK-NEXT:    add v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v3.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v2.16b, #127
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    ret
@@ -585,17 +485,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    add v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v3.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v2.16b, #127
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    ret
@@ -607,17 +503,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmge v1.2s, v1.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v2.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT:    cmeq v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v3.2s, #128, lsl #24
+; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v3.8b, v2.8b
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -628,17 +520,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v2.4s, #0
 ; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v5.4s
 ; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -649,29 +537,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v4.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmlt v16.4s, v4.4s, #0
+; CHECK-NEXT:    cmlt v7.4s, v4.4s, #0
 ; CHECK-NEXT:    mvni v6.4s, #128, lsl #24
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    add v7.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmlt v2.4s, v2.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT:    cmlt v16.4s, v7.4s, #0
 ; CHECK-NEXT:    mvni v5.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v16.4s, v4.4s, #0
-; CHECK-NEXT:    cmge v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmeq v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    cmge v16.4s, v7.4s, #0
-; CHECK-NEXT:    cmeq v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmlt v3.4s, v3.4s, #0
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v7.4s
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -694,42 +574,26 @@ define <16 x i32> @v16i32(<16 x i32> %x,
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.4s, v21.4s, #0
+; CHECK-NEXT:    cmlt v4.4s, v4.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v16.4s
 ; CHECK-NEXT:    mvni v22.4s, #128, lsl #24
 ; CHECK-NEXT:    add v23.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmlt v4.4s, v5.4s, #0
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v19.4s
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.4s, v23.4s, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmlt v4.4s, v6.4s, #0
+; CHECK-NEXT:    cmgt v2.4s, v2.4s, v21.4s
 ; CHECK-NEXT:    mvni v17.4s, #128, lsl #24
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmlt v4.4s, v7.4s, #0
+; CHECK-NEXT:    cmgt v3.4s, v3.4s, v23.4s
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.4s, v4.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v24.4s, v16.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v5.4s, #0
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmeq v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v19.4s, #0
-; CHECK-NEXT:    cmge v6.4s, v6.4s, #0
-; CHECK-NEXT:    cmge v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    cmeq v1.4s, v1.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v21.4s, #0
-; CHECK-NEXT:    cmge v7.4s, v7.4s, #0
-; CHECK-NEXT:    cmge v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmeq v6.4s, v2.4s, v6.4s
-; CHECK-NEXT:    cmeq v2.4s, v2.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v23.4s, #0
-; CHECK-NEXT:    cmeq v7.4s, v3.4s, v7.4s
-; CHECK-NEXT:    cmeq v3.4s, v3.4s, v24.4s
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -743,18 +607,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v5.2d, v2.2d, #0
 ; CHECK-NEXT:    mov x8, #9223372036854775807
 ; CHECK-NEXT:    cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT:    cmeq v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v5.2d
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
 ; CHECK-NEXT:    dup v4.2d, x8
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -766,31 +626,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v4.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v6.2d, v4.2d, #0
-; CHECK-NEXT:    dup v7.2d, x8
+; CHECK-NEXT:    cmlt v5.2d, v4.2d, #0
+; CHECK-NEXT:    dup v6.2d, x8
+; CHECK-NEXT:    mvn v7.16b, v5.16b
+; CHECK-NEXT:    mov v16.16b, v6.16b
+; CHECK-NEXT:    bsl v16.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    add v5.2d, v1.2d, v3.2d
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    mov v17.16b, v7.16b
-; CHECK-NEXT:    bsl v17.16b, v6.16b, v16.16b
-; CHECK-NEXT:    cmlt v6.2d, v5.2d, #0
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    bsl v7.16b, v6.16b, v16.16b
-; CHECK-NEXT:    cmge v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v6.2d, v4.2d, #0
-; CHECK-NEXT:    cmge v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmeq v2.2d, v0.2d, v2.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v6.2d
-; CHECK-NEXT:    cmge v6.2d, v5.2d, #0
-; CHECK-NEXT:    cmeq v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmeq v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v0.16b, v17.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v7.16b, v5.16b
+; CHECK-NEXT:    cmlt v2.2d, v2.2d, #0
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    cmlt v7.2d, v5.2d, #0
+; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT:    cmgt v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v7.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v2.16b
+; CHECK-NEXT:    bsl v0.16b, v16.16b, v4.16b
+; CHECK-NEXT:    bsl v1.16b, v6.16b, v5.16b
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -812,42 +664,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
 ; CHECK-NEXT:    bsl v25.16b, v20.16b, v24.16b
 ; CHECK-NEXT:    mvn v20.16b, v22.16b
 ; CHECK-NEXT:    mov v24.16b, v21.16b
+; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v16.2d
 ; CHECK-NEXT:    add v19.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    bsl v24.16b, v22.16b, v20.16b
 ; CHECK-NEXT:    mvn v20.16b, v23.16b
 ; CHECK-NEXT:    mov v22.16b, v21.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmlt v4.2d, v5.2d, #0
+; CHECK-NEXT:    cmgt v1.2d, v1.2d, v17.2d
 ; CHECK-NEXT:    bsl v22.16b, v23.16b, v20.16b
 ; CHECK-NEXT:    cmlt v20.2d, v19.2d, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmlt v4.2d, v6.2d, #0
+; CHECK-NEXT:    cmgt v2.2d, v2.2d, v18.2d
 ; CHECK-NEXT:    mvn v23.16b, v20.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmlt v4.2d, v7.2d, #0
+; CHECK-NEXT:    cmgt v3.2d, v3.2d, v19.2d
 ; CHECK-NEXT:    bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT:    cmge v4.2d, v4.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v20.2d, v16.2d, #0
-; CHECK-NEXT:    cmge v5.2d, v5.2d, #0
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmeq v4.2d, v0.2d, v4.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v17.2d, #0
-; CHECK-NEXT:    cmge v6.2d, v6.2d, #0
-; CHECK-NEXT:    cmge v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmeq v5.2d, v1.2d, v5.2d
-; CHECK-NEXT:    cmeq v1.2d, v1.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v18.2d, #0
-; CHECK-NEXT:    cmge v7.2d, v7.2d, #0
-; CHECK-NEXT:    cmge v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmeq v6.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmeq v2.2d, v2.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v19.2d, #0
-; CHECK-NEXT:    cmeq v7.2d, v3.2d, v7.2d
-; CHECK-NEXT:    cmeq v3.2d, v3.2d, v20.2d
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v25.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v24.16b, v17.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v18.16b

Modified: llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll Mon Sep 30 00:58:50 2019
@@ -54,18 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
 ; CHECK-LABEL: vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v2.4s, #0
 ; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v5.4s
 ; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);

Modified: llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -37,18 +37,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <1
 ; CHECK-LABEL: v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v2.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v3.16b, #127
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -59,31 +54,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <3
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v4.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmlt v16.16b, v4.16b, #0
+; CHECK-NEXT:    cmlt v7.16b, v4.16b, #0
 ; CHECK-NEXT:    movi v6.16b, #127
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    sub v7.16b, v1.16b, v3.16b
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmgt v2.16b, v2.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    cmlt v16.16b, v7.16b, #0
 ; CHECK-NEXT:    movi v5.16b, #127
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v16.16b, v4.16b, #0
-; CHECK-NEXT:    cmge v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmeq v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    cmge v16.16b, v7.16b, #0
-; CHECK-NEXT:    cmeq v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    cmeq v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmgt v3.16b, v3.16b, #0
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -106,46 +91,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <6
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.16b, v21.16b, #0
+; CHECK-NEXT:    cmgt v4.16b, v4.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v16.16b
 ; CHECK-NEXT:    movi v22.16b, #127
 ; CHECK-NEXT:    sub v23.16b, v3.16b, v7.16b
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmgt v4.16b, v5.16b, #0
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, v19.16b
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.16b, v23.16b, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmgt v4.16b, v6.16b, #0
+; CHECK-NEXT:    cmgt v2.16b, v2.16b, v21.16b
 ; CHECK-NEXT:    movi v17.16b, #127
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmgt v4.16b, v7.16b, #0
+; CHECK-NEXT:    cmgt v3.16b, v3.16b, v23.16b
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.16b, v4.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v24.16b, v16.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v5.16b, #0
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmeq v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v19.16b, #0
-; CHECK-NEXT:    cmge v6.16b, v6.16b, #0
-; CHECK-NEXT:    cmge v2.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    cmeq v1.16b, v1.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v21.16b, #0
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    cmge v7.16b, v7.16b, #0
-; CHECK-NEXT:    cmge v3.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v6.16b, v2.16b, v6.16b
-; CHECK-NEXT:    cmeq v2.16b, v2.16b, v24.16b
-; CHECK-NEXT:    cmge v24.16b, v23.16b, #0
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    mvn v4.16b, v5.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    cmeq v7.16b, v3.16b, v7.16b
-; CHECK-NEXT:    cmeq v3.16b, v3.16b, v24.16b
-; CHECK-NEXT:    and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    mvn v4.16b, v6.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    mvn v4.16b, v7.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -159,18 +124,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v5.8h, v2.8h, #0
 ; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    cmeq v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v5.8h
 ; CHECK-NEXT:    mvni v3.8h, #128, lsl #8
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -181,31 +141,21 @@ define <16 x i16> @v16i16(<16 x i16> %x,
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v4.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmlt v16.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
 ; CHECK-NEXT:    mvni v6.8h, #128, lsl #8
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    sub v7.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmgt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v4.8h
 ; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
 ; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v16.8h, v4.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmeq v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    cmge v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmeq v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmgt v3.8h, v3.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v7.8h
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -228,46 +178,26 @@ define <32 x i16> @v32i16(<32 x i16> %x,
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.8h, v21.8h, #0
+; CHECK-NEXT:    cmgt v4.8h, v4.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v16.8h
 ; CHECK-NEXT:    mvni v22.8h, #128, lsl #8
 ; CHECK-NEXT:    sub v23.8h, v3.8h, v7.8h
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmgt v4.8h, v5.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v19.8h
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.8h, v23.8h, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmgt v4.8h, v6.8h, #0
+; CHECK-NEXT:    cmgt v2.8h, v2.8h, v21.8h
 ; CHECK-NEXT:    mvni v17.8h, #128, lsl #8
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmgt v4.8h, v7.8h, #0
+; CHECK-NEXT:    cmgt v3.8h, v3.8h, v23.8h
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.8h, v4.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmge v24.8h, v16.8h, #0
-; CHECK-NEXT:    cmge v5.8h, v5.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmeq v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v19.8h, #0
-; CHECK-NEXT:    cmge v6.8h, v6.8h, #0
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmeq v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v21.8h, #0
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    cmge v7.8h, v7.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmeq v6.8h, v2.8h, v6.8h
-; CHECK-NEXT:    cmeq v2.8h, v2.8h, v24.8h
-; CHECK-NEXT:    cmge v24.8h, v23.8h, #0
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    mvn v4.16b, v5.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    cmeq v7.8h, v3.8h, v7.8h
-; CHECK-NEXT:    cmeq v3.8h, v3.8h, v24.8h
-; CHECK-NEXT:    and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    mvn v4.16b, v6.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    mvn v4.16b, v7.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -284,17 +214,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    movi v2.8b, #127
 ; CHECK-NEXT:    sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmge v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmge v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmge v5.8b, v3.8b, #0
 ; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT:    cmgt v1.8b, v1.8b, #0
+; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
@@ -327,18 +252,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8
 ; CHECK-NEXT:    shl v1.4h, v1.4h, #8
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
+; CHECK-NEXT:    cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -365,18 +285,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #24
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sub v3.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmge v2.2s, v2.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v3.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT:    cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    cmgt v2.2s, v2.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v3.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v2.8b, v2.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
@@ -398,17 +313,12 @@ define void @v4i16(<4 x i16>* %px, <4 x
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT:    cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
@@ -433,18 +343,13 @@ define void @v2i16(<2 x i16>* %px, <2 x
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #16
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-NEXT:    sub v3.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmge v2.2s, v2.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v3.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT:    cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    cmgt v2.2s, v2.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v3.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v2.8b, v2.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
@@ -463,18 +368,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <1
 ; CHECK-LABEL: v12i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v2.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v3.16b, #127
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@@ -489,29 +389,19 @@ define void @v12i16(<12 x i16>* %px, <12
 ; CHECK-NEXT:    mvni v5.8h, #128, lsl #8
 ; CHECK-NEXT:    mvni v4.8h, #128, lsl #8
 ; CHECK-NEXT:    sub v6.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmlt v16.8h, v6.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v6.8h, #0
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v5.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    sub v7.8h, v0.8h, v3.8h
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmgt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmgt v1.8h, v1.8h, v6.8h
 ; CHECK-NEXT:    cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v4.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.8h, v2.8h, #0
-; CHECK-NEXT:    cmge v1.8h, v1.8h, #0
-; CHECK-NEXT:    cmge v16.8h, v6.8h, #0
-; CHECK-NEXT:    cmge v3.8h, v3.8h, #0
-; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmeq v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT:    cmge v16.8h, v7.8h, #0
-; CHECK-NEXT:    cmeq v3.8h, v0.8h, v3.8h
-; CHECK-NEXT:    cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    cmgt v3.8h, v3.8h, #0
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v7.8h
+; CHECK-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    bsl v4.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v6.16b
 ; CHECK-NEXT:    bsl v0.16b, v4.16b, v7.16b
 ; CHECK-NEXT:    str q0, [x2]
@@ -531,17 +421,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8
 ; CHECK-NEXT:    ldr b1, [x1]
 ; CHECK-NEXT:    movi v2.8b, #127
 ; CHECK-NEXT:    sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmge v1.8b, v1.8b, #0
-; CHECK-NEXT:    cmge v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmge v5.8b, v3.8b, #0
 ; CHECK-NEXT:    cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT:    cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT:    cmgt v1.8b, v1.8b, #0
+; CHECK-NEXT:    cmgt v0.8b, v0.8b, v3.8b
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
@@ -559,17 +444,12 @@ define void @v1i16(<1 x i16>* %px, <1 x
 ; CHECK-NEXT:    ldr h1, [x1]
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmge v1.4h, v1.4h, #0
-; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmge v5.4h, v3.4h, #0
 ; CHECK-NEXT:    cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT:    cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT:    cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
@@ -586,18 +466,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <1
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #4
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #4
 ; CHECK-NEXT:    sub v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v3.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v2.16b, #127
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #4
 ; CHECK-NEXT:    ret
@@ -611,18 +486,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    sub v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmge v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmge v5.16b, v3.16b, #0
 ; CHECK-NEXT:    cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT:    cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    movi v2.16b, #127
+; CHECK-NEXT:    cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
 ; CHECK-NEXT:    ret
@@ -634,18 +504,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmge v1.2s, v1.2s, #0
-; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmge v5.2s, v2.2s, #0
 ; CHECK-NEXT:    cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT:    cmeq v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmeq v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    mvni v3.2s, #128, lsl #24
+; CHECK-NEXT:    cmgt v1.2s, v1.2s, #0
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    mvn v5.8b, v4.8b
-; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    bsl v0.8b, v3.8b, v2.8b
 ; CHECK-NEXT:    ret
   %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -656,18 +521,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v2.4s, #0
 ; CHECK-NEXT:    cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v5.4s
 ; CHECK-NEXT:    mvni v3.4s, #128, lsl #24
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    mvn v5.16b, v4.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -678,31 +538,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v4.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmlt v16.4s, v4.4s, #0
+; CHECK-NEXT:    cmlt v7.4s, v4.4s, #0
 ; CHECK-NEXT:    mvni v6.4s, #128, lsl #24
+; CHECK-NEXT:    mvn v16.16b, v7.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v16.16b
 ; CHECK-NEXT:    sub v7.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    cmgt v2.4s, v2.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT:    cmlt v16.4s, v7.4s, #0
 ; CHECK-NEXT:    mvni v5.4s, #128, lsl #24
-; CHECK-NEXT:    mvn v17.16b, v16.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    cmge v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v16.4s, v4.4s, #0
-; CHECK-NEXT:    cmge v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmeq v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    cmge v16.4s, v7.4s, #0
-; CHECK-NEXT:    cmeq v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    cmgt v3.4s, v3.4s, #0
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v7.4s
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v16.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v5.16b, v16.16b, v2.16b
 ; CHECK-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    ret
@@ -725,46 +575,26 @@ define <16 x i32> @v16i32(<16 x i32> %x,
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
 ; CHECK-NEXT:    bsl v20.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.4s, v21.4s, #0
+; CHECK-NEXT:    cmgt v4.4s, v4.4s, #0
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v16.4s
 ; CHECK-NEXT:    mvni v22.4s, #128, lsl #24
 ; CHECK-NEXT:    sub v23.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmgt v4.4s, v5.4s, #0
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v19.4s
 ; CHECK-NEXT:    bsl v22.16b, v24.16b, v25.16b
 ; CHECK-NEXT:    cmlt v24.4s, v23.4s, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmgt v4.4s, v6.4s, #0
+; CHECK-NEXT:    cmgt v2.4s, v2.4s, v21.4s
 ; CHECK-NEXT:    mvni v17.4s, #128, lsl #24
 ; CHECK-NEXT:    mvn v25.16b, v24.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmgt v4.4s, v7.4s, #0
+; CHECK-NEXT:    cmgt v3.4s, v3.4s, v23.4s
 ; CHECK-NEXT:    bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    cmge v4.4s, v4.4s, #0
-; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
-; CHECK-NEXT:    cmge v24.4s, v16.4s, #0
-; CHECK-NEXT:    cmge v5.4s, v5.4s, #0
-; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmeq v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v19.4s, #0
-; CHECK-NEXT:    cmge v6.4s, v6.4s, #0
-; CHECK-NEXT:    cmge v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmeq v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    cmeq v1.4s, v1.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v21.4s, #0
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    cmge v7.4s, v7.4s, #0
-; CHECK-NEXT:    cmge v3.4s, v3.4s, #0
-; CHECK-NEXT:    cmeq v6.4s, v2.4s, v6.4s
-; CHECK-NEXT:    cmeq v2.4s, v2.4s, v24.4s
-; CHECK-NEXT:    cmge v24.4s, v23.4s, #0
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    mvn v4.16b, v5.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    cmeq v7.4s, v3.4s, v7.4s
-; CHECK-NEXT:    cmeq v3.4s, v3.4s, v24.4s
-; CHECK-NEXT:    and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    mvn v4.16b, v6.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    mvn v4.16b, v7.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v18.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v20.16b, v19.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v21.16b
@@ -778,19 +608,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v2.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v5.2d, v2.2d, #0
 ; CHECK-NEXT:    mov x8, #9223372036854775807
 ; CHECK-NEXT:    cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT:    cmeq v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v5.2d
+; CHECK-NEXT:    cmgt v1.2d, v1.2d, #0
 ; CHECK-NEXT:    dup v4.2d, x8
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    mvn v5.16b, v3.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -802,33 +627,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub v4.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    mov x8, #9223372036854775807
-; CHECK-NEXT:    cmlt v6.2d, v4.2d, #0
-; CHECK-NEXT:    dup v7.2d, x8
+; CHECK-NEXT:    cmlt v5.2d, v4.2d, #0
+; CHECK-NEXT:    dup v6.2d, x8
+; CHECK-NEXT:    mvn v7.16b, v5.16b
+; CHECK-NEXT:    mov v16.16b, v6.16b
+; CHECK-NEXT:    bsl v16.16b, v5.16b, v7.16b
 ; CHECK-NEXT:    sub v5.2d, v1.2d, v3.2d
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    mov v17.16b, v7.16b
-; CHECK-NEXT:    bsl v17.16b, v6.16b, v16.16b
-; CHECK-NEXT:    cmlt v6.2d, v5.2d, #0
-; CHECK-NEXT:    mvn v16.16b, v6.16b
-; CHECK-NEXT:    bsl v7.16b, v6.16b, v16.16b
-; CHECK-NEXT:    cmge v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v6.2d, v4.2d, #0
-; CHECK-NEXT:    cmge v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmeq v2.2d, v0.2d, v2.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v6.2d
-; CHECK-NEXT:    cmge v6.2d, v5.2d, #0
-; CHECK-NEXT:    cmeq v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    cmeq v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    bsl v0.16b, v17.16b, v4.16b
-; CHECK-NEXT:    bsl v1.16b, v7.16b, v5.16b
+; CHECK-NEXT:    cmgt v2.2d, v2.2d, #0
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    cmlt v7.2d, v5.2d, #0
+; CHECK-NEXT:    cmgt v3.2d, v3.2d, #0
+; CHECK-NEXT:    cmgt v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v2.16b, v7.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    bsl v6.16b, v7.16b, v2.16b
+; CHECK-NEXT:    bsl v0.16b, v16.16b, v4.16b
+; CHECK-NEXT:    bsl v1.16b, v6.16b, v5.16b
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -850,46 +665,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
 ; CHECK-NEXT:    bsl v25.16b, v20.16b, v24.16b
 ; CHECK-NEXT:    mvn v20.16b, v22.16b
 ; CHECK-NEXT:    mov v24.16b, v21.16b
+; CHECK-NEXT:    cmgt v4.2d, v4.2d, #0
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v16.2d
 ; CHECK-NEXT:    sub v19.2d, v3.2d, v7.2d
 ; CHECK-NEXT:    bsl v24.16b, v22.16b, v20.16b
 ; CHECK-NEXT:    mvn v20.16b, v23.16b
 ; CHECK-NEXT:    mov v22.16b, v21.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    cmgt v4.2d, v5.2d, #0
+; CHECK-NEXT:    cmgt v1.2d, v1.2d, v17.2d
 ; CHECK-NEXT:    bsl v22.16b, v23.16b, v20.16b
 ; CHECK-NEXT:    cmlt v20.2d, v19.2d, #0
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    cmgt v4.2d, v6.2d, #0
+; CHECK-NEXT:    cmgt v2.2d, v2.2d, v18.2d
 ; CHECK-NEXT:    mvn v23.16b, v20.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    cmgt v4.2d, v7.2d, #0
+; CHECK-NEXT:    cmgt v3.2d, v3.2d, v19.2d
 ; CHECK-NEXT:    bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT:    cmge v4.2d, v4.2d, #0
-; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
-; CHECK-NEXT:    cmge v20.2d, v16.2d, #0
-; CHECK-NEXT:    cmge v5.2d, v5.2d, #0
-; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
-; CHECK-NEXT:    cmeq v4.2d, v0.2d, v4.2d
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v17.2d, #0
-; CHECK-NEXT:    cmge v6.2d, v6.2d, #0
-; CHECK-NEXT:    cmge v2.2d, v2.2d, #0
-; CHECK-NEXT:    cmeq v5.2d, v1.2d, v5.2d
-; CHECK-NEXT:    cmeq v1.2d, v1.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v18.2d, #0
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    cmge v7.2d, v7.2d, #0
-; CHECK-NEXT:    cmge v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmeq v6.2d, v2.2d, v6.2d
-; CHECK-NEXT:    cmeq v2.2d, v2.2d, v20.2d
-; CHECK-NEXT:    cmge v20.2d, v19.2d, #0
-; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    mvn v4.16b, v5.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    cmeq v7.2d, v3.2d, v7.2d
-; CHECK-NEXT:    cmeq v3.2d, v3.2d, v20.2d
-; CHECK-NEXT:    and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT:    mvn v4.16b, v6.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    mvn v4.16b, v7.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    bsl v0.16b, v25.16b, v16.16b
 ; CHECK-NEXT:    bsl v1.16b, v24.16b, v17.16b
 ; CHECK-NEXT:    bsl v2.16b, v22.16b, v18.16b

Modified: llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/saddo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/saddo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/saddo.ll Mon Sep 30 00:58:50 2019
@@ -13,29 +13,25 @@ declare { <2 x i32>, <2 x i1> } @llvm.sa
 define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ; SI-LABEL: saddo_i64_zext:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s8
-; SI-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
-; SI-NEXT:    s_mov_b32 s5, s9
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; SI-NEXT:    s_add_u32 s2, s10, s0
-; SI-NEXT:    s_addc_u32 s3, s11, s1
-; SI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_add_u32 s10, s6, s8
+; SI-NEXT:    s_addc_u32 s11, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT:    v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_xor_b64 s[4:5], s[6:7], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s10, v0
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: saddo_i64_zext:
@@ -43,22 +39,18 @@ define amdgpu_kernel void @saddo_i64_zex
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    s_add_u32 s8, s6, s0
+; VI-NEXT:    s_addc_u32 s9, s7, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
+; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[2:3]
-; VI-NEXT:    s_add_u32 s2, s6, s0
-; VI-NEXT:    s_addc_u32 s3, s7, s1
-; VI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v3, v2
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -68,22 +60,18 @@ define amdgpu_kernel void @saddo_i64_zex
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    s_add_u32 s8, s6, s0
+; GFX9-NEXT:    s_addc_u32 s9, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[2:3]
-; GFX9-NEXT:    s_add_u32 s2, s6, s0
-; GFX9-NEXT:    s_addc_u32 s3, s7, s1
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], v3, v2
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_endpgm
@@ -99,32 +87,27 @@ define amdgpu_kernel void @saddo_i64_zex
 define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
 ; SI-LABEL: s_saddo_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s8
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s1, -1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s0, -1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; SI-NEXT:    s_add_i32 s2, s0, s1
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s2, -1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT:    s_mov_b32 s5, s9
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    s_mov_b32 s10, s6
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    v_cmp_lt_i32_e64 s[10:11], s9, 0
+; SI-NEXT:    s_add_i32 s9, s8, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s9, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_saddo_i32:
@@ -133,18 +116,13 @@ define amdgpu_kernel void @s_saddo_i32(i
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s1, -1
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[2:3]
-; VI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s0, -1
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[2:3]
-; VI-NEXT:    s_add_i32 s2, s0, s1
-; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s2, -1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v5, v4
+; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], s1, 0
+; VI-NEXT:    s_add_i32 s1, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, s1, v4
+; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -158,18 +136,13 @@ define amdgpu_kernel void @s_saddo_i32(i
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_cmp_gt_i32_e64 s[2:3], s1, -1
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[2:3]
-; GFX9-NEXT:    v_cmp_gt_i32_e64 s[2:3], s0, -1
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[2:3]
-; GFX9-NEXT:    s_add_i32 s2, s0, s1
-; GFX9-NEXT:    v_cmp_gt_i32_e64 s[0:1], s2, -1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], v5, v4
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[2:3], s1, 0
+; GFX9-NEXT:    s_add_i32 s1, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, s1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
 ; GFX9-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
@@ -204,19 +177,12 @@ define amdgpu_kernel void @v_saddo_i32(i
 ; SI-NEXT:    s_mov_b32 s6, s14
 ; SI-NEXT:    s_mov_b32 s7, s15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
-; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v3, v1
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
+; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -235,17 +201,11 @@ define amdgpu_kernel void @v_saddo_i32(i
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v6
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
-; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v7, v5
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT:    flat_store_dword v[2:3], v4
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v6
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
+; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v5, v6
+; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    flat_store_dword v[2:3], v5
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -265,17 +225,11 @@ define amdgpu_kernel void @v_saddo_i32(i
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], v7, v5
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    global_store_dword v[2:3], v4, off
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v4
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[0:1], v5, v6
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    global_store_dword v[2:3], v5, off
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -292,31 +246,27 @@ define amdgpu_kernel void @v_saddo_i32(i
 define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
 ; SI-LABEL: s_saddo_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], -1
-; SI-NEXT:    s_add_u32 s2, s8, s10
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[8:9], -1
-; SI-NEXT:    s_addc_u32 s3, s9, s11
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s6, s14
-; SI-NEXT:    s_mov_b32 s7, s15
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_add_u32 s12, s4, s6
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_addc_u32 s13, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT:    s_mov_b32 s8, s2
+; SI-NEXT:    s_mov_b32 s9, s3
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_xor_b64 s[0:1], s[4:5], vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_saddo_i64:
@@ -324,22 +274,18 @@ define amdgpu_kernel void @s_saddo_i64(i
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_add_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
+; VI-NEXT:    s_addc_u32 s1, s5, s7
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; VI-NEXT:    s_add_u32 s2, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_addc_u32 s3, s5, s7
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v5, v4
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -350,22 +296,18 @@ define amdgpu_kernel void @s_saddo_i64(i
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_add_u32 s0, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
+; GFX9-NEXT:    s_addc_u32 s1, s5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX9-NEXT:    s_add_u32 s2, s4, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_addc_u32 s3, s5, s7
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], v5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
@@ -398,19 +340,12 @@ define amdgpu_kernel void @v_saddo_i64(i
 ; SI-NEXT:    s_mov_b32 s6, s14
 ; SI-NEXT:    s_mov_b32 s7, s15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[0:1]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v5, v2
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; SI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
@@ -430,18 +365,12 @@ define amdgpu_kernel void @v_saddo_i64(i
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v8
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], v9, v6
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v6, v4
+; VI-NEXT:    v_addc_u32_e32 v9, vcc, v7, v5, vcc
+; VI-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[8:9]
+; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -461,18 +390,12 @@ define amdgpu_kernel void @v_saddo_i64(i
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v5, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], v9, v6
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -489,48 +412,35 @@ define amdgpu_kernel void @v_saddo_i64(i
 define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
 ; SI-LABEL: v_saddo_v2i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
-; SI-NEXT:    s_mov_b32 s2, s14
-; SI-NEXT:    s_mov_b32 s3, s15
+; SI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s19, 0xf000
+; SI-NEXT:    s_mov_b32 s18, -1
+; SI-NEXT:    s_mov_b32 s2, s18
+; SI-NEXT:    s_mov_b32 s3, s19
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s10
-; SI-NEXT:    s_mov_b32 s1, s11
-; SI-NEXT:    s_mov_b32 s10, s14
-; SI-NEXT:    s_mov_b32 s11, s15
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s14
+; SI-NEXT:    s_mov_b32 s1, s15
+; SI-NEXT:    s_mov_b32 s14, s18
+; SI-NEXT:    s_mov_b32 s15, s19
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s6, s14
-; SI-NEXT:    s_mov_b32 s7, s15
+; SI-NEXT:    s_mov_b32 s16, s10
+; SI-NEXT:    s_mov_b32 s17, s11
+; SI-NEXT:    s_mov_b32 s10, s18
+; SI-NEXT:    s_mov_b32 s11, s19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v0
-; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], -1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v6, v2
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT:    v_cmp_ne_u32_e64 s[4:5], v6, v2
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT:    v_cmp_ne_u32_e64 s[2:3], v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_saddo_v2i32:
@@ -543,33 +453,21 @@ define amdgpu_kernel void @v_saddo_v2i32
 ; VI-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[6:7]
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v6
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v5
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v7
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
-; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], -1, v5
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v10, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], v10, v6
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v8
-; VI-NEXT:    v_cmp_ne_u32_e64 s[2:3], v9, v7
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v7, v5
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v6, v4
+; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v5
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
+; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v6
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[8:9]
 ; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -584,33 +482,21 @@ define amdgpu_kernel void @v_saddo_v2i32
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[6:7], off
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v5
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[2:3], -1, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], v10, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], v10, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v8
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], v9, v7
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
+; GFX9-NEXT:    v_add_u32_e32 v9, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v8, v6, v4
+; GFX9-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v7
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v6
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b64 s[0:1], vcc, s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_endpgm

Modified: llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll Mon Sep 30 00:58:50 2019
@@ -95,76 +95,48 @@ define <2 x i1> @usubo(<2 x i64> *%ptr,
 define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
 ; CHECK-LABEL: saddo:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov.32 r1, d16[1]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vmov.32 r2, d17[1]
-; CHECK-NEXT:    vadd.i64 q8, q9, q8
-; CHECK-NEXT:    vmov.32 r12, d18[1]
-; CHECK-NEXT:    vmov.32 r4, d19[1]
-; CHECK-NEXT:    vmov.32 lr, d16[1]
-; CHECK-NEXT:    vmov.32 r7, d17[1]
-; CHECK-NEXT:    cmp.w r1, #-1
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vadd.i64 q8, q10, q9
+; CHECK-NEXT:    vmov.32 r2, d20[0]
+; CHECK-NEXT:    vmov.32 r1, d20[1]
+; CHECK-NEXT:    vmov.32 r12, d16[0]
+; CHECK-NEXT:    vmov.32 r8, d16[1]
+; CHECK-NEXT:    vmov.32 lr, d17[0]
+; CHECK-NEXT:    vmov.32 r4, d21[0]
+; CHECK-NEXT:    vmov.32 r5, d17[1]
+; CHECK-NEXT:    vmov.32 r6, d18[1]
+; CHECK-NEXT:    vmov.32 r7, d21[1]
+; CHECK-NEXT:    subs.w r2, r12, r2
+; CHECK-NEXT:    vmov.32 r2, d19[1]
+; CHECK-NEXT:    sbcs.w r1, r8, r1
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r1, #-1
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    mov.w r2, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r2, #1
-; CHECK-NEXT:    cmp.w r12, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r5, #-1
-; CHECK-NEXT:    cmp.w r4, #-1
-; CHECK-NEXT:    mov.w r4, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #1
-; CHECK-NEXT:    cmp.w lr, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r6, #1
-; CHECK-NEXT:    cmp r6, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r6, #-1
-; CHECK-NEXT:    cmp.w r7, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    subs.w r4, lr, r4
+; CHECK-NEXT:    sbcs.w r7, r5, r7
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r3, #-1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    vdup.32 d19, r3
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r4, #-1
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    asrs r7, r6, #31
+; CHECK-NEXT:    vdup.32 d21, r3
+; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    vdup.32 d23, r2
-; CHECK-NEXT:    vdup.32 d21, r4
-; CHECK-NEXT:    vdup.32 d18, r6
-; CHECK-NEXT:    vdup.32 d22, r1
-; CHECK-NEXT:    vdup.32 d20, r5
-; CHECK-NEXT:    vceq.i32 q9, q10, q9
+; CHECK-NEXT:    movne.w r1, #-1
+; CHECK-NEXT:    vdup.32 d20, r1
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vceq.i32 q10, q10, q11
-; CHECK-NEXT:    vrev64.32 q11, q9
-; CHECK-NEXT:    vrev64.32 q12, q10
-; CHECK-NEXT:    vand q9, q9, q11
-; CHECK-NEXT:    vand q10, q10, q12
-; CHECK-NEXT:    vbic q9, q10, q9
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vdup.32 d19, r2
+; CHECK-NEXT:    vdup.32 d18, r7
+; CHECK-NEXT:    veor q9, q9, q10
 ; CHECK-NEXT:    vmovn.i64 d18, q9
 ; CHECK-NEXT:    vmov r2, r1, d18
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
   %x = load <2 x i64>, <2 x i64>* %ptr, align 8
   %y = load <2 x i64>, <2 x i64>* %ptr2, align 8
   %s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -177,77 +149,64 @@ define <2 x i1> @saddo(<2 x i64> *%ptr,
 define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
 ; CHECK-LABEL: ssubo:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
 ; CHECK-NEXT:    vsub.i64 q8, q10, q9
+; CHECK-NEXT:    vmov.32 r1, d20[0]
 ; CHECK-NEXT:    vmov.32 r12, d20[1]
-; CHECK-NEXT:    vmov.32 lr, d21[1]
-; CHECK-NEXT:    vmov.32 r1, d16[1]
-; CHECK-NEXT:    vmov.32 r2, d17[1]
-; CHECK-NEXT:    vmov.32 r4, d18[1]
-; CHECK-NEXT:    vmov.32 r7, d19[1]
-; CHECK-NEXT:    cmp.w r1, #-1
+; CHECK-NEXT:    vmov.32 r3, d16[0]
+; CHECK-NEXT:    vmov.32 lr, d16[1]
+; CHECK-NEXT:    vmov.32 r4, d21[0]
+; CHECK-NEXT:    vmov.32 r5, d17[0]
+; CHECK-NEXT:    vmov.32 r6, d21[1]
+; CHECK-NEXT:    vmov.32 r7, d17[1]
+; CHECK-NEXT:    vmov.32 r8, d18[1]
+; CHECK-NEXT:    subs r1, r3, r1
+; CHECK-NEXT:    vmov.32 r3, d18[0]
+; CHECK-NEXT:    sbcs.w r1, lr, r12
+; CHECK-NEXT:    vmov.32 r12, d19[0]
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r1, #1
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    subs r5, r5, r4
+; CHECK-NEXT:    vmov.32 r5, d19[1]
+; CHECK-NEXT:    sbcs r7, r6
+; CHECK-NEXT:    mov.w r7, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r7, #1
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r7, #-1
+; CHECK-NEXT:    vdup.32 d21, r7
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    sbcs.w r3, r2, r8
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    rsbs.w r6, r12, #0
+; CHECK-NEXT:    sbcs.w r6, r2, r5
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r1, #-1
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    mov.w r2, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r2, #1
-; CHECK-NEXT:    cmp.w r12, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r5, #-1
-; CHECK-NEXT:    cmp.w lr, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r6, #1
-; CHECK-NEXT:    cmp.w r4, #-1
-; CHECK-NEXT:    mov.w r4, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r4, #-1
-; CHECK-NEXT:    cmp.w r7, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #1
+; CHECK-NEXT:    movne.w r2, #-1
 ; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vdup.32 d19, r2
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r3, #-1
-; CHECK-NEXT:    vdup.32 d19, r3
-; CHECK-NEXT:    cmp r6, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r6, #-1
-; CHECK-NEXT:    vdup.32 d21, r6
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vdup.32 d18, r4
+; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    vdup.32 d23, r2
-; CHECK-NEXT:    vdup.32 d20, r5
-; CHECK-NEXT:    vdup.32 d22, r1
-; CHECK-NEXT:    vceq.i32 q9, q10, q9
+; CHECK-NEXT:    movne.w r1, #-1
+; CHECK-NEXT:    vdup.32 d18, r3
+; CHECK-NEXT:    vdup.32 d20, r1
+; CHECK-NEXT:    veor q9, q9, q10
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vceq.i32 q10, q10, q11
-; CHECK-NEXT:    vrev64.32 q11, q9
-; CHECK-NEXT:    vrev64.32 q12, q10
-; CHECK-NEXT:    vand q9, q9, q11
-; CHECK-NEXT:    vand q10, q10, q12
-; CHECK-NEXT:    vmvn q9, q9
-; CHECK-NEXT:    vbic q9, q9, q10
 ; CHECK-NEXT:    vmovn.i64 d18, q9
 ; CHECK-NEXT:    vmov r2, r1, d18
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
   %x = load <2 x i64>, <2 x i64>* %ptr, align 8
   %y = load <2 x i64>, <2 x i64>* %ptr2, align 8
   %s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)

Modified: llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll (original)
+++ llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll Mon Sep 30 00:58:50 2019
@@ -10,17 +10,11 @@ declare {i32, i1} @llvm.usub.with.overfl
 define i1 @sadd(i32 %a, i32 %b, i32* %c) nounwind {
 ; RV32I-LABEL: sadd:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi a3, zero, -1
-; RV32I-NEXT:    slt a4, a3, a1
-; RV32I-NEXT:    slt a5, a3, a0
-; RV32I-NEXT:    xor a4, a5, a4
-; RV32I-NEXT:    seqz a4, a4
-; RV32I-NEXT:    add a1, a0, a1
-; RV32I-NEXT:    slt a0, a3, a1
-; RV32I-NEXT:    xor a0, a5, a0
-; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    and a0, a4, a0
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    slt a0, a3, a0
+; RV32I-NEXT:    slti a1, a1, 0
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    sw a3, 0(a2)
 ; RV32I-NEXT:    ret
 entry:
   %x = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -33,16 +27,10 @@ entry:
 define i1 @ssub(i32 %a, i32 %b, i32* %c) nounwind {
 ; RV32I-LABEL: ssub:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    addi a3, zero, -1
-; RV32I-NEXT:    slt a4, a3, a1
-; RV32I-NEXT:    slt a5, a3, a0
-; RV32I-NEXT:    xor a4, a5, a4
-; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    sgtz a3, a1
 ; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    slt a0, a3, a1
-; RV32I-NEXT:    xor a0, a5, a0
-; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    and a0, a4, a0
+; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    xor a0, a3, a0
 ; RV32I-NEXT:    sw a1, 0(a2)
 ; RV32I-NEXT:    ret
 entry:

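(For reference, the scalar pattern that the RV32I output above now follows can be written out directly; this is only an illustrative C++ sketch, not code from the patch, and the helper names are hypothetical. It assumes two's-complement wrapping, done here through unsigned arithmetic to keep the sketch well defined.)

    #include <cstdint>

    // Illustrative only: signed-add overflow is detected by XOR-ing
    // "RHS is negative" with "wrapped sum is less than LHS", matching the
    // slti/slt/xor sequence emitted for @sadd above.
    static bool saddOverflows(int32_t A, int32_t B) {
      int32_t Sum = (int32_t)((uint32_t)A + (uint32_t)B); // wrapping add
      return (B < 0) != (Sum < A);
    }

    // Dual form for subtraction: test the RHS for being (non-zero) positive
    // instead, matching the sgtz/sub/slt/xor sequence emitted for @ssub above.
    static bool ssubOverflows(int32_t A, int32_t B) {
      int32_t Diff = (int32_t)((uint32_t)A - (uint32_t)B); // wrapping sub
      return (B > 0) != (Diff < A);
    }
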
Modified: llvm/trunk/test/CodeGen/X86/combine-mulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-mulo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-mulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-mulo.ll Mon Sep 30 00:58:50 2019
@@ -34,30 +34,21 @@ define <4 x i32> @combine_vec_smul_two(<
 ; SSE-LABEL: combine_vec_smul_two:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    paddd %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    paddd %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm4, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_smul_two:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX-NEXT:    retq
   %1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
   %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0

Modified: llvm/trunk/test/CodeGen/X86/mulo-pow2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mulo-pow2.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mulo-pow2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mulo-pow2.ll Mon Sep 30 00:58:50 2019
@@ -98,15 +98,10 @@ define <4 x i32> @smul_v4i32_1(<4 x i32>
 define <4 x i32> @smul_v4i32_2(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: smul_v4i32_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX-NEXT:    retq
     %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
     %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0

Modified: llvm/trunk/test/CodeGen/X86/sadd_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sadd_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sadd_sat.ll Mon Sep 30 00:58:50 2019
@@ -183,28 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
 ;
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm2, %xmm2
 ; X64-NEXT:    pxor %xmm3, %xmm3
-; X64-NEXT:    pxor %xmm4, %xmm4
-; X64-NEXT:    pcmpgtd %xmm1, %xmm4
-; X64-NEXT:    pcmpeqd %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm2, %xmm4
-; X64-NEXT:    pxor %xmm5, %xmm5
-; X64-NEXT:    pcmpgtd %xmm0, %xmm5
-; X64-NEXT:    pxor %xmm2, %xmm5
-; X64-NEXT:    pcmpeqd %xmm5, %xmm4
-; X64-NEXT:    paddd %xmm1, %xmm0
-; X64-NEXT:    pcmpgtd %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm3, %xmm2
-; X64-NEXT:    pcmpeqd %xmm5, %xmm2
-; X64-NEXT:    pandn %xmm4, %xmm2
-; X64-NEXT:    movdqa %xmm3, %xmm1
-; X64-NEXT:    pandn {{.*}}(%rip), %xmm1
-; X64-NEXT:    psrld $1, %xmm3
-; X64-NEXT:    por %xmm1, %xmm3
-; X64-NEXT:    pand %xmm2, %xmm3
-; X64-NEXT:    pandn %xmm0, %xmm2
+; X64-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-NEXT:    paddd %xmm0, %xmm1
+; X64-NEXT:    pcmpgtd %xmm1, %xmm0
+; X64-NEXT:    pxor %xmm3, %xmm0
+; X64-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm3
+; X64-NEXT:    pandn {{.*}}(%rip), %xmm3
+; X64-NEXT:    psrld $1, %xmm2
 ; X64-NEXT:    por %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    pandn %xmm1, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
   ret <4 x i32> %tmp;

Modified: llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -598,133 +598,88 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm2, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pandn %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pandn %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    psrld $1, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSSE3-NEXT:    psrld $1, %xmm2
 ; SSSE3-NEXT:    por %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT:    paddd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    pandn %xmm4, %xmm3
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm3
+; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -733,133 +688,88 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm2, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pandn %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pandn %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    psrld $1, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSSE3-NEXT:    psrld $1, %xmm2
 ; SSSE3-NEXT:    por %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT:    paddd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    pandn %xmm4, %xmm3
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm3
+; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -868,214 +778,135 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm7
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT:    paddd %xmm2, %xmm8
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm5, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm7, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:    pandn %xmm4, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    por %xmm7, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm8, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    pandn %xmm6, %xmm7
+; SSE2-NEXT:    psrld $1, %xmm5
+; SSE2-NEXT:    por %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT:    pxor %xmm6, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm5
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    movdqa %xmm6, %xmm2
-; SSE2-NEXT:    pandn %xmm4, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    por %xmm2, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm8
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm7
-; SSSE3-NEXT:    paddd %xmm2, %xmm8
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    paddd %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm5, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSSE3-NEXT:    pandn %xmm7, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm7
-; SSSE3-NEXT:    pandn %xmm4, %xmm7
-; SSSE3-NEXT:    psrld $1, %xmm2
-; SSSE3-NEXT:    por %xmm7, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pandn %xmm8, %xmm0
-; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm7
+; SSSE3-NEXT:    pandn %xmm6, %xmm7
+; SSSE3-NEXT:    psrld $1, %xmm5
+; SSSE3-NEXT:    por %xmm7, %xmm5
+; SSSE3-NEXT:    pand %xmm0, %xmm5
+; SSSE3-NEXT:    pandn %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm5, %xmm0
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm5
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    movdqa %xmm6, %xmm2
-; SSSE3-NEXT:    pandn %xmm4, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm6
-; SSSE3-NEXT:    por %xmm2, %xmm6
-; SSSE3-NEXT:    pand %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm1, %xmm5
-; SSSE3-NEXT:    por %xmm6, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    paddd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    pandn %xmm6, %xmm2
+; SSSE3-NEXT:    psrld $1, %xmm4
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm3, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm8, %xmm8
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE41-NEXT:    paddd %xmm2, %xmm6
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    pandn %xmm7, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm5, %xmm7
-; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm7
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm6
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm8
-; SSE41-NEXT:    pxor %xmm8, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT:    pandn %xmm0, %xmm4
-; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm5
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    paddd %xmm2, %xmm5
+; SSE41-NEXT:    movaps {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    movaps %xmm6, %xmm7
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT:    movaps %xmm6, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm6, %xmm2
+; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    movaps %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandnps %ymm8, %ymm2, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm5
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %ymm5, {{.*}}(%rip), %ymm6, %ymm6
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendvps %ymm0, %ymm6, %ymm5, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvps %ymm2, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vblendvps %ymm0, %ymm3, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %ymm0, %ymm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -1084,378 +915,230 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm9, %xmm0
 ; SSE2-NEXT:    pxor %xmm10, %xmm10
-; SSE2-NEXT:    pxor %xmm12, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT:    pxor %xmm9, %xmm12
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm12
-; SSE2-NEXT:    paddd %xmm4, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm10
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pandn %xmm9, %xmm11
+; SSE2-NEXT:    psrld $1, %xmm10
+; SSE2-NEXT:    por %xmm11, %xmm10
+; SSE2-NEXT:    pand %xmm0, %xmm10
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm10, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    pandn %xmm12, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm12, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    pandn %xmm9, %xmm10
 ; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pandn %xmm11, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm11
-; SSE2-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NEXT:    por %xmm10, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm11
-; SSE2-NEXT:    paddd %xmm5, %xmm8
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pandn %xmm12, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT:    paddd %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    paddd %xmm6, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm8
-; SSE2-NEXT:    pxor %xmm9, %xmm8
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT:    pandn %xmm4, %xmm8
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    pandn %xmm12, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm6, %xmm8
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm9, %xmm5
+; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm10
-; SSE2-NEXT:    pxor %xmm10, %xmm9
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSE2-NEXT:    pandn %xmm2, %xmm9
-; SSE2-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NEXT:    pandn %xmm12, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm10
-; SSE2-NEXT:    por %xmm2, %xmm10
-; SSE2-NEXT:    pand %xmm9, %xmm10
-; SSE2-NEXT:    pandn %xmm3, %xmm9
-; SSE2-NEXT:    por %xmm10, %xmm9
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    paddd %xmm3, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pandn %xmm9, %xmm4
+; SSE2-NEXT:    psrld $1, %xmm8
+; SSE2-NEXT:    por %xmm4, %xmm8
+; SSE2-NEXT:    pand %xmm3, %xmm8
+; SSE2-NEXT:    pandn %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm8, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm1, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm11
+; SSSE3-NEXT:    pxor %xmm8, %xmm8
+; SSSE3-NEXT:    pxor %xmm9, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT:    paddd %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm9, %xmm0
 ; SSSE3-NEXT:    pxor %xmm10, %xmm10
-; SSSE3-NEXT:    pxor %xmm12, %xmm12
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm12
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT:    pxor %xmm9, %xmm12
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm12
-; SSSE3-NEXT:    paddd %xmm4, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm10
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm10, %xmm11
+; SSSE3-NEXT:    pandn %xmm9, %xmm11
+; SSSE3-NEXT:    psrld $1, %xmm10
+; SSSE3-NEXT:    por %xmm11, %xmm10
+; SSSE3-NEXT:    pand %xmm0, %xmm10
+; SSSE3-NEXT:    pandn %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm10, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm9, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm12, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm12, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    paddd %xmm1, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm10
+; SSSE3-NEXT:    pandn %xmm9, %xmm10
 ; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pand %xmm0, %xmm4
-; SSSE3-NEXT:    pandn %xmm11, %xmm0
-; SSSE3-NEXT:    por %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm11
-; SSSE3-NEXT:    pxor %xmm9, %xmm11
+; SSSE3-NEXT:    por %xmm10, %xmm4
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm5, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm11
-; SSSE3-NEXT:    paddd %xmm5, %xmm8
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm1
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm11, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pandn %xmm12, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm5
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm1, %xmm5
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT:    paddd %xmm2, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    paddd %xmm6, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm8
-; SSSE3-NEXT:    pxor %xmm9, %xmm8
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT:    pandn %xmm4, %xmm8
-; SSSE3-NEXT:    movdqa %xmm6, %xmm4
-; SSSE3-NEXT:    pandn %xmm12, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm6
-; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pand %xmm8, %xmm6
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm6, %xmm8
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pandn %xmm9, %xmm5
+; SSSE3-NEXT:    psrld $1, %xmm4
+; SSSE3-NEXT:    por %xmm5, %xmm4
+; SSSE3-NEXT:    pand %xmm2, %xmm4
+; SSSE3-NEXT:    pandn %xmm6, %xmm2
+; SSSE3-NEXT:    por %xmm4, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT:    paddd %xmm7, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm10
-; SSSE3-NEXT:    pxor %xmm10, %xmm9
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSSE3-NEXT:    pandn %xmm2, %xmm9
-; SSSE3-NEXT:    movdqa %xmm10, %xmm2
-; SSSE3-NEXT:    pandn %xmm12, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm10
-; SSSE3-NEXT:    por %xmm2, %xmm10
-; SSSE3-NEXT:    pand %xmm9, %xmm10
-; SSSE3-NEXT:    pandn %xmm3, %xmm9
-; SSSE3-NEXT:    por %xmm10, %xmm9
-; SSSE3-NEXT:    movdqa %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa %xmm9, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSSE3-NEXT:    paddd %xmm3, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pandn %xmm9, %xmm4
+; SSSE3-NEXT:    psrld $1, %xmm8
+; SSSE3-NEXT:    por %xmm4, %xmm8
+; SSSE3-NEXT:    pand %xmm3, %xmm8
+; SSSE3-NEXT:    pandn %xmm7, %xmm3
+; SSSE3-NEXT:    por %xmm8, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm3, %xmm8
+; SSE41-NEXT:    movdqa %xmm2, %xmm12
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm9
-; SSE41-NEXT:    pxor %xmm8, %xmm8
-; SSE41-NEXT:    pxor %xmm11, %xmm11
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm11
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT:    pxor %xmm10, %xmm11
-; SSE41-NEXT:    pxor %xmm12, %xmm12
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT:    pxor %xmm10, %xmm12
-; SSE41-NEXT:    pcmpeqd %xmm12, %xmm11
 ; SSE41-NEXT:    paddd %xmm4, %xmm9
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
-; SSE41-NEXT:    pandn %xmm11, %xmm4
-; SSE41-NEXT:    movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm11, %xmm13
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm13
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm13, %xmm9
-; SSE41-NEXT:    xorps %xmm13, %xmm13
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm13
-; SSE41-NEXT:    pxor %xmm10, %xmm13
-; SSE41-NEXT:    pxor %xmm14, %xmm14
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm14
-; SSE41-NEXT:    pxor %xmm10, %xmm14
-; SSE41-NEXT:    pcmpeqd %xmm14, %xmm13
-; SSE41-NEXT:    paddd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm14, %xmm4
-; SSE41-NEXT:    pandn %xmm13, %xmm4
-; SSE41-NEXT:    movaps %xmm11, %xmm5
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm13, %xmm13
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm13
-; SSE41-NEXT:    pxor %xmm10, %xmm13
-; SSE41-NEXT:    xorps %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm13
-; SSE41-NEXT:    paddd %xmm6, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT:    pandn %xmm13, %xmm4
-; SSE41-NEXT:    movaps %xmm11, %xmm5
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm5
+; SSE41-NEXT:    movaps {{.*#+}} xmm11 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movaps {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    movaps %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm9
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    paddd %xmm5, %xmm4
+; SSE41-NEXT:    movaps %xmm10, %xmm2
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    paddd %xmm7, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm8
-; SSE41-NEXT:    pxor %xmm8, %xmm10
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm10
-; SSE41-NEXT:    pandn %xmm0, %xmm10
+; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm12, %xmm3
+; SSE41-NEXT:    paddd %xmm6, %xmm3
+; SSE41-NEXT:    movaps %xmm10, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm12
+; SSE41-NEXT:    pxor %xmm6, %xmm12
+; SSE41-NEXT:    movdqa %xmm12, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm8, %xmm5
+; SSE41-NEXT:    paddd %xmm7, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT:    pxor %xmm7, %xmm8
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm11
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
 ; SSE41-NEXT:    movaps %xmm9, %xmm0
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    movaps %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm5, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm12, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm10
-; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm10, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm11
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm10, %xmm10
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm12, %xmm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-NEXT:    vandnps %ymm8, %ymm4, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm7
 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm10 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm7, %ymm8, %ymm10, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm4, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm11
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm7, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm9
-; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm11
-; AVX1-NEXT:    vpcmpgtd %xmm11, %xmm12, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm12, %xmm3
-; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vandnps %ymm9, %ymm2, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vblendvps %ymm3, %ymm8, %ymm10, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %ymm7, %ymm8, %ymm9, %ymm10
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendvps %ymm0, %ymm10, %ymm7, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm6
+; AVX1-NEXT:    vblendvps %ymm6, %ymm8, %ymm9, %ymm7
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm7, %ymm6, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT:    vpcmpeqd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT:    vpandn %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT:    vblendvps %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm3
-; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vblendvps %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vblendvps %ymm4, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendvps %ymm0, %ymm7, %ymm4, %ymm0
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm5, %ymm6, %ymm4
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vblendvps %ymm1, %ymm4, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k0
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -1465,152 +1148,120 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; SSE2-LABEL: v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
 ; SSE2-NEXT:    pxor %xmm5, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    paddq %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
 ; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
 ; SSSE3-NEXT:    pxor %xmm5, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm2
-; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm1, %xmm3
 ; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
 ; SSE41-NEXT:    paddq %xmm1, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm6
+; SSE41-NEXT:    por %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm5, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm5
-; SSE41-NEXT:    pxor %xmm1, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm0
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT:    pandn %xmm4, %xmm1
 ; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; SSE41-NEXT:    blendvpd %xmm0, {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1620,56 +1271,36 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ;
 ; AVX1-LABEL: v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -1678,369 +1309,279 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-LABEL: v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT:    paddq %xmm2, %xmm9
-; SSE2-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    pxor %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    movdqa %xmm9, %xmm0
-; SSE2-NEXT:    pxor %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm10, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    paddq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm9, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    pandn %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pandn %xmm11, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm8, %xmm5
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    pandn %xmm9, %xmm0
-; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    paddq %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm10, %xmm3
-; SSE2-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm6, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NEXT:    movdqa %xmm10, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pandn %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    pandn %xmm11, %xmm3
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm9, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT:    paddq %xmm2, %xmm9
-; SSSE3-NEXT:    pxor %xmm10, %xmm2
-; SSSE3-NEXT:    movdqa %xmm10, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT:    pxor %xmm6, %xmm7
-; SSSE3-NEXT:    pxor %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm10, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    movdqa %xmm9, %xmm0
-; SSSE3-NEXT:    pxor %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm10, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    paddq %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm0, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm5, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm0, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm7
-; SSSE3-NEXT:    pxor %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm8, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm5, %xmm0
+; SSSE3-NEXT:    movdqa %xmm8, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
 ; SSSE3-NEXT:    pandn %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    movdqa %xmm5, %xmm2
-; SSSE3-NEXT:    pandn %xmm11, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm8, %xmm5
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    pand %xmm0, %xmm5
-; SSSE3-NEXT:    pandn %xmm9, %xmm0
-; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
 ; SSSE3-NEXT:    paddq %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm10, %xmm3
-; SSSE3-NEXT:    movdqa %xmm10, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm6, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm2
-; SSSE3-NEXT:    movdqa %xmm10, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm8, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm3
-; SSSE3-NEXT:    pxor %xmm6, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm10, %xmm2
-; SSSE3-NEXT:    movdqa %xmm10, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pandn %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm4, %xmm3
-; SSSE3-NEXT:    pandn %xmm11, %xmm3
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pandn %xmm9, %xmm4
+; SSSE3-NEXT:    pand %xmm7, %xmm3
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm4, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm9
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT:    paddq %xmm2, %xmm9
-; SSE41-NEXT:    pxor %xmm10, %xmm2
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm8
+; SSE41-NEXT:    movdqa %xmm8, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm7, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm7, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm7
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm7
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
 ; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT:    pandn %xmm7, %xmm2
-; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT:    movapd %xmm11, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT:    movdqa %xmm5, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movapd %xmm6, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
 ; SSE41-NEXT:    paddq %xmm3, %xmm1
-; SSE41-NEXT:    pxor %xmm10, %xmm3
-; SSE41-NEXT:    movdqa %xmm10, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pcmpeqq %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm3, %xmm4
-; SSE41-NEXT:    pandn %xmm2, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm11
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm1
-; SSE41-NEXT:    movapd %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm2, %xmm7
+; SSE41-NEXT:    por %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm7, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT:    movapd %xmm8, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqq %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpaddq %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandnpd %ymm8, %ymm2, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm5
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vblendvpd %ymm5, {{.*}}(%rip), %ymm6, %ymm6
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm6, %ymm5, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %ymm0, %ymm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm2, %k0
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm2, %k2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -2050,687 +1591,513 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
 ; SSE2-LABEL: v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    movdqa %xmm0, %xmm12
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE2-NEXT:    paddq %xmm4, %xmm13
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm9, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT:    pxor %xmm10, %xmm1
 ; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm12
+; SSE2-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm11, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm14
-; SSE2-NEXT:    pxor %xmm10, %xmm14
-; SSE2-NEXT:    pcmpeqd %xmm14, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2]
-; SSE2-NEXT:    pand %xmm1, %xmm11
-; SSE2-NEXT:    movdqa %xmm13, %xmm0
-; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm9, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm10
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    movdqa %xmm9, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm14, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm11, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm11, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm12, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm13, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm10, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT:    pand %xmm11, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pandn %xmm12, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
 ; SSE2-NEXT:    paddq %xmm5, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm12
 ; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm9, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm13, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm12, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm15
-; SSE2-NEXT:    pxor %xmm10, %xmm15
-; SSE2-NEXT:    pcmpeqd %xmm15, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm13
-; SSE2-NEXT:    movdqa %xmm8, %xmm1
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm14, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm15, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm13, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
 ; SSE2-NEXT:    pand %xmm12, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm10, %xmm4
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm1, %xmm5
 ; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
 ; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm8
 ; SSE2-NEXT:    pxor %xmm9, %xmm6
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    pand %xmm12, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
 ; SSE2-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm8, %xmm6
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
-; SSE2-NEXT:    pand %xmm12, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm5
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm4, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
 ; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
 ; SSE2-NEXT:    pxor %xmm9, %xmm7
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm9, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm7
-; SSE2-NEXT:    pxor %xmm7, %xmm10
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2]
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    movdqa %xmm7, %xmm2
-; SSE2-NEXT:    pandn %xmm11, %xmm2
-; SSE2-NEXT:    pand %xmm12, %xmm7
-; SSE2-NEXT:    por %xmm2, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    por %xmm7, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm6
 ; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    movdqa %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm13
+; SSSE3-NEXT:    movdqa %xmm0, %xmm12
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSSE3-NEXT:    paddq %xmm4, %xmm13
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm10, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT:    pxor %xmm10, %xmm1
 ; SSSE3-NEXT:    pxor %xmm9, %xmm0
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    paddq %xmm4, %xmm12
+; SSSE3-NEXT:    movdqa %xmm12, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm10
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm11, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm14
-; SSSE3-NEXT:    pxor %xmm10, %xmm14
-; SSSE3-NEXT:    pcmpeqd %xmm14, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm11
-; SSSE3-NEXT:    movdqa %xmm13, %xmm0
-; SSSE3-NEXT:    pxor %xmm9, %xmm0
-; SSSE3-NEXT:    movdqa %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm10
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa %xmm9, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm11, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm14, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pandn %xmm11, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm11, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm12, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm13, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT:    pand %xmm11, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pand %xmm0, %xmm4
+; SSSE3-NEXT:    pandn %xmm12, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
 ; SSSE3-NEXT:    paddq %xmm5, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm12
 ; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm13, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm15
-; SSSE3-NEXT:    pxor %xmm10, %xmm15
-; SSSE3-NEXT:    pcmpeqd %xmm15, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm13
-; SSSE3-NEXT:    movdqa %xmm8, %xmm1
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm14, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm12, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm15, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pandn %xmm13, %xmm1
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
 ; SSSE3-NEXT:    pand %xmm12, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pandn %xmm10, %xmm4
+; SSSE3-NEXT:    pand %xmm11, %xmm5
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm1, %xmm5
 ; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
 ; SSSE3-NEXT:    paddq %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm8
 ; SSSE3-NEXT:    pxor %xmm9, %xmm6
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    pand %xmm12, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm8
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm8, %xmm6
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
-; SSSE3-NEXT:    pand %xmm12, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm6, %xmm4
-; SSSE3-NEXT:    pandn %xmm2, %xmm6
-; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm5
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pand %xmm4, %xmm6
+; SSSE3-NEXT:    pandn %xmm2, %xmm4
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
 ; SSSE3-NEXT:    paddq %xmm7, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm9, %xmm2
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
 ; SSSE3-NEXT:    pxor %xmm9, %xmm7
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pand %xmm8, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm9, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm7
-; SSSE3-NEXT:    pxor %xmm7, %xmm10
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    pandn %xmm2, %xmm5
-; SSSE3-NEXT:    movdqa %xmm7, %xmm2
-; SSSE3-NEXT:    pandn %xmm11, %xmm2
-; SSSE3-NEXT:    pand %xmm12, %xmm7
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pand %xmm5, %xmm7
-; SSSE3-NEXT:    pandn %xmm3, %xmm5
-; SSSE3-NEXT:    por %xmm7, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm2
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    pand %xmm5, %xmm6
+; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm8
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    paddq %xmm4, %xmm8
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm8, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm11, %xmm12
+; SSE41-NEXT:    por %xmm0, %xmm12
+; SSE41-NEXT:    pxor %xmm9, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
 ; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm12
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm9
-; SSE41-NEXT:    pxor %xmm9, %xmm12
-; SSE41-NEXT:    pxor %xmm10, %xmm11
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm11, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm12, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm11
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm11
-; SSE41-NEXT:    pxor %xmm9, %xmm11
-; SSE41-NEXT:    pcmpeqq %xmm11, %xmm12
-; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm11, %xmm4
-; SSE41-NEXT:    pandn %xmm12, %xmm4
-; SSE41-NEXT:    movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT:    movapd %xmm11, %xmm13
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm13
+; SSE41-NEXT:    movdqa %xmm9, %xmm12
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm12
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm12, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT:    movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movapd %xmm10, %xmm12
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm12
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm8
-; SSE41-NEXT:    movdqa %xmm1, %xmm14
+; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm8
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    paddq %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm15
-; SSE41-NEXT:    pxor %xmm9, %xmm15
-; SSE41-NEXT:    pxor %xmm10, %xmm14
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm14, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm12
+; SSE41-NEXT:    pxor %xmm9, %xmm12
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm5
+; SSE41-NEXT:    pand %xmm4, %xmm13
+; SSE41-NEXT:    por %xmm0, %xmm13
 ; SSE41-NEXT:    pxor %xmm9, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm15
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    pandn %xmm15, %xmm4
-; SSE41-NEXT:    movapd %xmm11, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm14, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm13, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm13
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm13
+; SSE41-NEXT:    movdqa %xmm9, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm10, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    paddq %xmm6, %xmm2
-; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm5
-; SSE41-NEXT:    pxor %xmm9, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm9, %xmm6
-; SSE41-NEXT:    pcmpeqq %xmm6, %xmm5
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm12
+; SSE41-NEXT:    pxor %xmm9, %xmm12
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm6, %xmm4
-; SSE41-NEXT:    pandn %xmm5, %xmm4
-; SSE41-NEXT:    movapd %xmm11, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm5
+; SSE41-NEXT:    por %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm9, %xmm6
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm5
+; SSE41-NEXT:    movdqa %xmm9, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm10, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    paddq %xmm7, %xmm3
-; SSE41-NEXT:    pxor %xmm10, %xmm7
-; SSE41-NEXT:    movdqa %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
 ; SSE41-NEXT:    pxor %xmm9, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm10, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm9
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm9
-; SSE41-NEXT:    pandn %xmm4, %xmm9
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm11
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm6
+; SSE41-NEXT:    por %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm9, %xmm7
 ; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm6, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm9, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm10
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
 ; SSE41-NEXT:    movapd %xmm8, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm12, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm10
-; AVX1-NEXT:    vpcmpeqq %xmm8, %xmm10, %xmm8
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm11
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm11, %xmm6, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT:    vpaddq %xmm9, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm4
-; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm10, %xmm10
-; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm12, %xmm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-NEXT:    vandnpd %ymm8, %ymm4, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm7
 ; AVX1-NEXT:    vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm10 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm7, %ymm8, %ymm10, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm11
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm12, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqq %xmm11, %xmm7, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm9
-; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm11
-; AVX1-NEXT:    vpcmpgtq %xmm11, %xmm12, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm12, %xmm3
-; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vandnpd %ymm9, %ymm2, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm3, %ymm8, %ymm10, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm9 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vblendvpd %ymm7, %ymm8, %ymm9, %ymm10
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm10, %ymm7, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm6, %ymm8, %ymm9, %ymm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm7, %ymm6, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT:    vpcmpeqq %ymm8, %ymm7, %ymm7
-; AVX2-NEXT:    vpandn %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm3
-; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm7, %ymm4, %ymm0
+; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm6, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm4, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %zmm0, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm2, %k0
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm2, %k2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z

Modified: llvm/trunk/test/CodeGen/X86/ssub_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssub_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssub_sat.ll Mon Sep 30 00:58:50 2019
@@ -183,30 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
 ;
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    pxor %xmm3, %xmm3
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    pcmpgtd %xmm1, %xmm0
-; X64-NEXT:    pcmpeqd %xmm4, %xmm4
-; X64-NEXT:    pxor %xmm4, %xmm0
-; X64-NEXT:    pxor %xmm5, %xmm5
-; X64-NEXT:    pcmpgtd %xmm2, %xmm5
-; X64-NEXT:    pxor %xmm4, %xmm5
-; X64-NEXT:    pcmpeqd %xmm5, %xmm0
-; X64-NEXT:    psubd %xmm1, %xmm2
-; X64-NEXT:    pcmpgtd %xmm2, %xmm3
-; X64-NEXT:    movdqa %xmm3, %xmm1
-; X64-NEXT:    pxor %xmm4, %xmm1
-; X64-NEXT:    pcmpeqd %xmm5, %xmm1
-; X64-NEXT:    pxor %xmm4, %xmm1
-; X64-NEXT:    pandn %xmm1, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    movdqa %xmm0, %xmm3
+; X64-NEXT:    psubd %xmm1, %xmm3
+; X64-NEXT:    pcmpgtd %xmm2, %xmm1
+; X64-NEXT:    pcmpgtd %xmm3, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    pcmpgtd %xmm3, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm1
 ; X64-NEXT:    pandn {{.*}}(%rip), %xmm1
-; X64-NEXT:    psrld $1, %xmm3
-; X64-NEXT:    por %xmm1, %xmm3
-; X64-NEXT:    pand %xmm0, %xmm3
-; X64-NEXT:    pandn %xmm2, %xmm0
-; X64-NEXT:    por %xmm3, %xmm0
+; X64-NEXT:    psrld $1, %xmm2
+; X64-NEXT:    por %xmm1, %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    pandn %xmm3, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp

Modified: llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -598,141 +598,94 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    psubd %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    psubd %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    psubd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    psrld $1, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pand %xmm0, %xmm3
-; SSSE3-NEXT:    pandn %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    psrld $1, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    pandn %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -741,141 +694,94 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    psubd %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    psubd %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    psubd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    psrld $1, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm3
-; SSSE3-NEXT:    pand %xmm0, %xmm3
-; SSSE3-NEXT:    pandn %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    psrld $1, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    pandn %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -884,226 +790,144 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm0
-; SSE2-NEXT:    psubd %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm6, %xmm2
-; SSE2-NEXT:    pandn %xmm7, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    por %xmm2, %xmm6
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT:    psubd %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    pandn %xmm7, %xmm3
-; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psubd %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pandn %xmm6, %xmm7
+; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm5, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psubd %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm6, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT:    pxor %xmm8, %xmm0
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pxor %xmm8, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm0
-; SSSE3-NEXT:    psubd %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm2
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
-; SSSE3-NEXT:    pandn %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm6, %xmm2
-; SSSE3-NEXT:    pandn %xmm7, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm6
-; SSSE3-NEXT:    por %xmm2, %xmm6
-; SSSE3-NEXT:    pand %xmm0, %xmm6
-; SSSE3-NEXT:    pandn %xmm4, %xmm0
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT:    psubd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
-; SSSE3-NEXT:    pxor %xmm8, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
-; SSSE3-NEXT:    pxor %xmm8, %xmm3
-; SSSE3-NEXT:    pandn %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
-; SSSE3-NEXT:    pandn %xmm7, %xmm3
-; SSSE3-NEXT:    psrld $1, %xmm5
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pand %xmm2, %xmm5
-; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    psubd %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pandn %xmm6, %xmm7
+; SSSE3-NEXT:    psrld $1, %xmm2
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm5, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    psubd %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm6, %xmm3
+; SSSE3-NEXT:    psrld $1, %xmm4
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm2, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm8, %xmm8
 ; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT:    pxor %xmm8, %xmm6
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pxor %xmm8, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
 ; SSE41-NEXT:    psubd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm8, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE41-NEXT:    pxor %xmm8, %xmm2
-; SSE41-NEXT:    pandn %xmm2, %xmm6
-; SSE41-NEXT:    movaps {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm7, %xmm2
-; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm2
-; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm5
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm8, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm8, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    psubd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm3
-; SSE41-NEXT:    pxor %xmm8, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm8, %xmm3
-; SSE41-NEXT:    pandn %xmm3, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm4
+; SSE41-NEXT:    movaps {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    movaps %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm2
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm7
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psubd %xmm3, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm6
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm6, %xmm2
 ; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    movaps %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpsubd %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandnps %ymm2, %ymm8, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %ymm1, {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %ymm0, %ymm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -1112,399 +936,244 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm12
-; SSE2-NEXT:    pxor %xmm9, %xmm9
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT:    pxor %xmm10, %xmm0
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT:    psubd %xmm4, %xmm12
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pandn %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm11, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm12, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm10, %xmm1
-; SSE2-NEXT:    pxor %xmm12, %xmm12
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm12
-; SSE2-NEXT:    pxor %xmm10, %xmm12
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm1
-; SSE2-NEXT:    psubd %xmm5, %xmm8
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pandn %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT:    pxor %xmm10, %xmm8
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT:    psubd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
+; SSE2-NEXT:    psubd %xmm4, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pandn %xmm6, %xmm8
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm4, %xmm11
+; SSE2-NEXT:    pandn %xmm10, %xmm11
 ; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm4, %xmm8
+; SSE2-NEXT:    por %xmm11, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pandn %xmm9, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psubd %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm1
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSE2-NEXT:    psubd %xmm7, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pandn %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    pandn %xmm11, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm9
-; SSE2-NEXT:    por %xmm2, %xmm9
-; SSE2-NEXT:    pand %xmm5, %xmm9
-; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pandn %xmm10, %xmm9
+; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    por %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    psubd %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm10, %xmm6
+; SSE2-NEXT:    psrld $1, %xmm5
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psubd %xmm7, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm7, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm5
+; SSE2-NEXT:    psrld $1, %xmm8
+; SSE2-NEXT:    por %xmm5, %xmm8
+; SSE2-NEXT:    pand %xmm3, %xmm8
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm8, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm1, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm12
-; SSSE3-NEXT:    pxor %xmm9, %xmm9
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT:    pxor %xmm10, %xmm0
-; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm11
-; SSSE3-NEXT:    pxor %xmm10, %xmm11
-; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT:    psubd %xmm4, %xmm12
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pandn %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm11, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm12, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT:    pxor %xmm10, %xmm1
-; SSSE3-NEXT:    pxor %xmm12, %xmm12
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm12
-; SSSE3-NEXT:    pxor %xmm10, %xmm12
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm1
-; SSSE3-NEXT:    psubd %xmm5, %xmm8
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pandn %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    pxor %xmm8, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm8
-; SSSE3-NEXT:    pxor %xmm10, %xmm8
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT:    psubd %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm9
+; SSSE3-NEXT:    psubd %xmm4, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pandn %xmm6, %xmm8
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm11
+; SSSE3-NEXT:    pandn %xmm10, %xmm11
 ; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm4, %xmm8
+; SSSE3-NEXT:    por %xmm11, %xmm4
+; SSSE3-NEXT:    pand %xmm0, %xmm4
+; SSSE3-NEXT:    pandn %xmm9, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    psubd %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm1
 ; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm10, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSSE3-NEXT:    psubd %xmm7, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm9
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pandn %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm9, %xmm2
-; SSSE3-NEXT:    pandn %xmm11, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm9
-; SSSE3-NEXT:    por %xmm2, %xmm9
-; SSSE3-NEXT:    pand %xmm5, %xmm9
-; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm9
+; SSSE3-NEXT:    pandn %xmm10, %xmm9
+; SSSE3-NEXT:    psrld $1, %xmm5
 ; SSSE3-NEXT:    por %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
+; SSSE3-NEXT:    pand %xmm1, %xmm5
+; SSSE3-NEXT:    pandn %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    psubd %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pandn %xmm10, %xmm6
+; SSSE3-NEXT:    psrld $1, %xmm5
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm4, %xmm2
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    psubd %xmm7, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm7, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm5
+; SSSE3-NEXT:    psrld $1, %xmm8
+; SSSE3-NEXT:    por %xmm5, %xmm8
+; SSSE3-NEXT:    pand %xmm3, %xmm8
+; SSSE3-NEXT:    pandn %xmm4, %xmm3
+; SSSE3-NEXT:    por %xmm8, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm9
-; SSE41-NEXT:    pxor %xmm8, %xmm8
+; SSE41-NEXT:    movdqa %xmm3, %xmm8
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
 ; SSE41-NEXT:    pxor %xmm10, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm10
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm11
-; SSE41-NEXT:    pxor %xmm11, %xmm10
-; SSE41-NEXT:    pxor %xmm12, %xmm12
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT:    pxor %xmm11, %xmm12
-; SSE41-NEXT:    pcmpeqd %xmm12, %xmm10
+; SSE41-NEXT:    movdqa %xmm0, %xmm9
 ; SSE41-NEXT:    psubd %xmm4, %xmm9
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm11, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
-; SSE41-NEXT:    pxor %xmm11, %xmm4
-; SSE41-NEXT:    pandn %xmm4, %xmm10
-; SSE41-NEXT:    movaps {{.*#+}} xmm13 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm12, %xmm4
-; SSE41-NEXT:    blendvps %xmm0, %xmm13, %xmm4
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT:    movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    movaps %xmm11, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm9
-; SSE41-NEXT:    xorps %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pxor %xmm11, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm10
-; SSE41-NEXT:    pxor %xmm11, %xmm10
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm4
-; SSE41-NEXT:    psubd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm11, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT:    pxor %xmm11, %xmm5
-; SSE41-NEXT:    pandn %xmm5, %xmm4
-; SSE41-NEXT:    movaps %xmm12, %xmm5
-; SSE41-NEXT:    blendvps %xmm0, %xmm13, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT:    pxor %xmm11, %xmm4
-; SSE41-NEXT:    xorps %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm11, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT:    psubd %xmm6, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm11, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm11, %xmm6
-; SSE41-NEXT:    pandn %xmm6, %xmm4
-; SSE41-NEXT:    movaps %xmm12, %xmm5
-; SSE41-NEXT:    blendvps %xmm0, %xmm13, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psubd %xmm5, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm11, %xmm3
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT:    pxor %xmm11, %xmm4
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm11, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    psubd %xmm7, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm8
+; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psubd %xmm6, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm6, %xmm2
+; SSE41-NEXT:    movaps %xmm11, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm8, %xmm5
-; SSE41-NEXT:    pxor %xmm11, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm11, %xmm5
-; SSE41-NEXT:    pandn %xmm5, %xmm4
+; SSE41-NEXT:    psubd %xmm7, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT:    pxor %xmm7, %xmm8
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm11
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm13, %xmm12
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm3
+; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm5
 ; SSE41-NEXT:    movaps %xmm9, %xmm0
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    movaps %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm5, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm10, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm12
-; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm12, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm11
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm5, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT:    vpsubd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm11
-; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm10, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm5, %ymm5
-; AVX1-NEXT:    vandnps %ymm5, %ymm8, %ymm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm7
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm11 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm7, %ymm8, %ymm11, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm5, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm10, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm12
-; AVX1-NEXT:    vpcmpgtd %xmm12, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm10, %xmm3
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT:    vandnps %ymm2, %ymm9, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT:    vblendvps %ymm3, %ymm8, %ymm11, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm6, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vblendvps %ymm2, %ymm4, %ymm6, %ymm7
+; AVX1-NEXT:    vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsubd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvps %ymm2, %ymm4, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvps %ymm1, %ymm3, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT:    vpcmpeqd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpandn %ymm7, %ymm5, %ymm5
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT:    vblendvps %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm3
-; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpxor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpandn %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvps %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm5
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vblendvps %ymm2, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT:    vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm2
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendvps %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -1514,50 +1183,38 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; SSE2-LABEL: v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    pandn %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE2-NEXT:    por %xmm2, %xmm3
@@ -1570,50 +1227,38 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; SSSE3-LABEL: v2i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    psubq %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm5, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm2, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    pandn %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm2
 ; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSSE3-NEXT:    por %xmm2, %xmm3
@@ -1626,46 +1271,32 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; SSE41-LABEL: v2i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
 ; SSE41-NEXT:    psubq %xmm1, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
 ; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm6
+; SSE41-NEXT:    por %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm0
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pandn %xmm3, %xmm1
 ; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; SSE41-NEXT:    blendvpd %xmm0, {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1676,57 +1307,39 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 ; AVX1-LABEL: v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -1735,381 +1348,285 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-LABEL: v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE2-NEXT:    psubq %xmm2, %xmm10
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT:    pxor %xmm9, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm9, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pandn %xmm8, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm11, %xmm4
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pandn %xmm10, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm9, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    psubq %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm9, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm9, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    pandn %xmm8, %xmm3
-; SSE2-NEXT:    pand %xmm11, %xmm5
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm9, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm10
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSSE3-NEXT:    psubq %xmm2, %xmm10
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    psubq %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm0, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT:    pxor %xmm9, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm7, %xmm0
-; SSSE3-NEXT:    movdqa %xmm10, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    movdqa %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm5, %xmm0
+; SSSE3-NEXT:    movdqa %xmm8, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm9, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    pandn %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm2
-; SSSE3-NEXT:    pandn %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm11, %xmm4
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pand %xmm0, %xmm4
-; SSSE3-NEXT:    pandn %xmm10, %xmm0
-; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
 ; SSSE3-NEXT:    psubq %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm5, %xmm3
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm3
-; SSSE3-NEXT:    pxor %xmm9, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    pxor %xmm9, %xmm3
-; SSSE3-NEXT:    pandn %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
-; SSSE3-NEXT:    pandn %xmm8, %xmm3
-; SSSE3-NEXT:    pand %xmm11, %xmm5
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pand %xmm2, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pandn %xmm9, %xmm4
+; SSSE3-NEXT:    pand %xmm7, %xmm3
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm9
-; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
-; SSE41-NEXT:    psubq %xmm2, %xmm9
-; SSE41-NEXT:    pxor %xmm11, %xmm2
-; SSE41-NEXT:    movdqa %xmm11, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    psubq %xmm2, %xmm8
+; SSE41-NEXT:    movdqa %xmm8, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT:    pxor %xmm10, %xmm2
-; SSE41-NEXT:    pxor %xmm11, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pxor %xmm11, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm4, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pandn %xmm5, %xmm2
-; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT:    movapd %xmm7, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm9
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psubq %xmm3, %xmm1
-; SSE41-NEXT:    pxor %xmm11, %xmm3
-; SSE41-NEXT:    movdqa %xmm11, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm10, %xmm2
-; SSE41-NEXT:    pxor %xmm11, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT:    movdqa %xmm5, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm10, %xmm3
-; SSE41-NEXT:    pcmpeqq %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movapd %xmm6, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm11, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pandn %xmm4, %xmm2
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    psubq %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm2, %xmm7
+; SSE41-NEXT:    por %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm7, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT:    movapd %xmm9, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT:    movapd %xmm8, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqq %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandnpd %ymm2, %ymm8, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vblendvpd %ymm1, {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpcmpnltq %ymm2, %ymm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %ymm0, %ymm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm2, %k2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -2119,414 +1636,324 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
 ; SSE2-LABEL: v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    movdqa %xmm0, %xmm12
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE2-NEXT:    psubq %xmm4, %xmm13
+; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    psubq %xmm4, %xmm12
+; SSE2-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm10
 ; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm9, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT:    pxor %xmm10, %xmm1
-; SSE2-NEXT:    pxor %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pand %xmm11, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm15
-; SSE2-NEXT:    pxor %xmm10, %xmm15
-; SSE2-NEXT:    pcmpeqd %xmm15, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm13, %xmm1
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm14
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm14, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NEXT:    pcmpeqd %xmm15, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm10, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
 ; SSE2-NEXT:    pand %xmm11, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pandn %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm11, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm12, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm13, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pandn %xmm12, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
 ; SSE2-NEXT:    psubq %xmm5, %xmm8
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm13, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm12
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm9, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm14, %xmm15
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3]
-; SSE2-NEXT:    por %xmm15, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm13
-; SSE2-NEXT:    pxor %xmm10, %xmm13
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm13, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pandn %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm12, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm12, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm10, %xmm4
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm1, %xmm5
 ; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm4
 ; SSE2-NEXT:    psubq %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm8
 ; SSE2-NEXT:    pxor %xmm9, %xmm6
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    pand %xmm12, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
 ; SSE2-NEXT:    movdqa %xmm9, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pandn %xmm6, %xmm8
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm11, %xmm5
-; SSE2-NEXT:    pand %xmm12, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm4, %xmm8
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    psubq %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm9, %xmm7
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm5
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm4, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm4
 ; SSE2-NEXT:    por %xmm6, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    psubq %xmm7, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm9, %xmm7
+; SSE2-NEXT:    movdqa %xmm7, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm10, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm10, %xmm6
-; SSE2-NEXT:    pandn %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pandn %xmm11, %xmm2
-; SSE2-NEXT:    pand %xmm12, %xmm4
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    movdqa %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm13
+; SSSE3-NEXT:    movdqa %xmm0, %xmm12
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSSE3-NEXT:    psubq %xmm4, %xmm13
+; SSSE3-NEXT:    pxor %xmm9, %xmm0
+; SSSE3-NEXT:    psubq %xmm4, %xmm12
+; SSSE3-NEXT:    movdqa %xmm12, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm10
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm11, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm10
 ; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm10, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT:    pxor %xmm10, %xmm1
-; SSSE3-NEXT:    pxor %xmm9, %xmm0
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pand %xmm11, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm11, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm15
-; SSSE3-NEXT:    pxor %xmm10, %xmm15
-; SSSE3-NEXT:    pcmpeqd %xmm15, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm13, %xmm1
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    movdqa %xmm9, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm14
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm14, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm11
-; SSSE3-NEXT:    pxor %xmm10, %xmm11
-; SSSE3-NEXT:    pcmpeqd %xmm15, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
 ; SSSE3-NEXT:    pand %xmm11, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pandn %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm11, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm12, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm13, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pand %xmm0, %xmm4
+; SSSE3-NEXT:    pandn %xmm12, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
 ; SSSE3-NEXT:    psubq %xmm5, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm12
 ; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm13, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm1
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm8, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm9, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm12, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm14, %xmm15
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3]
-; SSSE3-NEXT:    por %xmm15, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm13
-; SSSE3-NEXT:    pxor %xmm10, %xmm13
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pandn %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm12, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pandn %xmm10, %xmm4
+; SSSE3-NEXT:    pand %xmm11, %xmm5
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm1, %xmm5
 ; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
 ; SSSE3-NEXT:    psubq %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm8
 ; SSSE3-NEXT:    pxor %xmm9, %xmm6
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    pand %xmm12, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm8
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pandn %xmm6, %xmm8
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm11, %xmm5
-; SSSE3-NEXT:    pand %xmm12, %xmm4
-; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm4, %xmm8
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    psubq %xmm7, %xmm3
-; SSSE3-NEXT:    pxor %xmm9, %xmm7
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm5
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pand %xmm4, %xmm6
+; SSSE3-NEXT:    pandn %xmm2, %xmm4
 ; SSSE3-NEXT:    por %xmm6, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    psubq %xmm7, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    movdqa %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm9, %xmm7
+; SSSE3-NEXT:    movdqa %xmm7, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm2
-; SSSE3-NEXT:    pxor %xmm10, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm10, %xmm6
-; SSSE3-NEXT:    pandn %xmm6, %xmm5
-; SSSE3-NEXT:    movdqa %xmm4, %xmm2
-; SSSE3-NEXT:    pandn %xmm11, %xmm2
-; SSSE3-NEXT:    pand %xmm12, %xmm4
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm2
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    pand %xmm5, %xmm6
 ; SSSE3-NEXT:    pandn %xmm3, %xmm5
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
@@ -2534,293 +1961,198 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm8
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    psubq %xmm4, %xmm8
+; SSE41-NEXT:    movdqa %xmm8, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm11, %xmm12
+; SSE41-NEXT:    por %xmm0, %xmm12
 ; SSE41-NEXT:    pxor %xmm9, %xmm4
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm12
-; SSE41-NEXT:    pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm12, %xmm15
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT:    pxor %xmm10, %xmm15
-; SSE41-NEXT:    pxor %xmm9, %xmm11
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE41-NEXT:    pand %xmm12, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm11, %xmm12
-; SSE41-NEXT:    pxor %xmm10, %xmm12
-; SSE41-NEXT:    pcmpeqq %xmm12, %xmm15
-; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm12, %xmm4
 ; SSE41-NEXT:    movdqa %xmm9, %xmm11
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm11
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pcmpeqq %xmm12, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pandn %xmm4, %xmm15
-; SSE41-NEXT:    movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT:    movapd %xmm11, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm4
-; SSE41-NEXT:    movdqa %xmm15, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT:    movdqa %xmm9, %xmm12
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm12
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm12, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT:    movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movapd %xmm10, %xmm12
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm12
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm12
+; SSE41-NEXT:    pxor %xmm9, %xmm12
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm13
+; SSE41-NEXT:    por %xmm0, %xmm13
 ; SSE41-NEXT:    pxor %xmm9, %xmm5
-; SSE41-NEXT:    movdqa %xmm9, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm14, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm13, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm13
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm13
 ; SSE41-NEXT:    movdqa %xmm9, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm13, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm14
-; SSE41-NEXT:    pxor %xmm10, %xmm14
-; SSE41-NEXT:    pcmpeqq %xmm14, %xmm4
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm9, %xmm13
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm15
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
-; SSE41-NEXT:    por %xmm15, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm14, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pandn %xmm5, %xmm4
-; SSE41-NEXT:    movapd %xmm11, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm10, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    psubq %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm12
+; SSE41-NEXT:    pxor %xmm9, %xmm12
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm5
+; SSE41-NEXT:    por %xmm0, %xmm5
 ; SSE41-NEXT:    pxor %xmm9, %xmm6
-; SSE41-NEXT:    movdqa %xmm9, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
 ; SSE41-NEXT:    movdqa %xmm9, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm5
 ; SSE41-NEXT:    movdqa %xmm9, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    pandn %xmm6, %xmm4
-; SSE41-NEXT:    movapd %xmm11, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm12, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm10, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    psubq %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    pxor %xmm9, %xmm5
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm6
+; SSE41-NEXT:    por %xmm0, %xmm6
 ; SSE41-NEXT:    pxor %xmm9, %xmm7
-; SSE41-NEXT:    movdqa %xmm9, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm9, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm4
+; SSE41-NEXT:    por %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm6, %xmm4
 ; SSE41-NEXT:    movdqa %xmm9, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    pandn %xmm6, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm11
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm9, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm10
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
 ; SSE41-NEXT:    movapd %xmm8, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm10, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm12
-; AVX1-NEXT:    vpcmpeqq %xmm8, %xmm12, %xmm8
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm11
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm11, %xmm5, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT:    vpsubq %xmm9, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm6, %xmm12, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm11
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm10, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm5, %ymm5
-; AVX1-NEXT:    vandnpd %ymm5, %ymm8, %ymm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm7
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm11 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm7, %ymm8, %ymm11, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm10, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm6, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm10, %xmm7
-; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqq %xmm12, %xmm7, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm12
-; AVX1-NEXT:    vpcmpgtq %xmm12, %xmm10, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm10, %xmm3
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT:    vpcmpeqq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT:    vandnpd %ymm2, %ymm9, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm3, %ymm8, %ymm11, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsubq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm0, %ymm6, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm4, %ymm6, %ymm7
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsubq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm4, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT:    vpcmpeqq %ymm8, %ymm7, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpandn %ymm7, %ymm5, %ymm5
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm3
-; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpxor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpandn %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm5
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm2
+; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpnltq %zmm2, %zmm0, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %zmm0, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vpcmpgtq %zmm2, %zmm1, %k0
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm2, %k2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z

Modified: llvm/trunk/test/CodeGen/X86/vec_saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_saddo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_saddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll Mon Sep 30 00:58:50 2019
@@ -52,67 +52,40 @@ define <2 x i32> @saddo_v2i32(<2 x i32>
 ; SSE-LABEL: saddo_v2i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE-NEXT:    pxor %xmm4, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    movq %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movq %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: saddo_v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: saddo_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovq %xmm1, (%rdi)
@@ -129,82 +102,45 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
 ; SSE2-LABEL: saddo_v3i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movq %xmm0, (%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, 8(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, 8(%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: saddo_v3i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pandn %xmm3, %xmm2
-; SSSE3-NEXT:    movq %xmm0, (%rdi)
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: saddo_v3i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    pandn %xmm3, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
-; SSE41-NEXT:    movq %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
+; SSE41-NEXT:    movq %xmm1, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v3i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
 ; AVX1-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -212,17 +148,10 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
 ; AVX2-LABEL: saddo_v3i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
 ; AVX2-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
@@ -230,13 +159,10 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
 ; AVX512-LABEL: saddo_v3i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
@@ -254,67 +180,40 @@ define <4 x i32> @saddo_v4i32(<4 x i32>
 ; SSE-LABEL: saddo_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE-NEXT:    pxor %xmm4, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v4i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: saddo_v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: saddo_v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -331,164 +230,118 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
 ; SSE2-LABEL: saddo_v6i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE2-NEXT:    movd %r8d, %xmm0
 ; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movd %edx, %xmm0
-; SSE2-NEXT:    movd %esi, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT:    movd %esi, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    movd %r9d, %xmm3
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movd %r9d, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT:    pandn %xmm6, %xmm2
-; SSE2-NEXT:    movq %xmm1, 16(%rcx)
-; SSE2-NEXT:    movdqa %xmm0, (%rcx)
-; SSE2-NEXT:    movq %xmm2, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NEXT:    movq %xmm2, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movq %xmm5, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm6, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: saddo_v6i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSSE3-NEXT:    movd %r8d, %xmm0
 ; SSSE3-NEXT:    movd %ecx, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    movd %esi, %xmm4
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT:    movd %esi, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    movd %r9d, %xmm3
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movd %r9d, %xmm0
+; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT:    paddd %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSSE3-NEXT:    pandn %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    paddd %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
 ; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT:    pandn %xmm6, %xmm2
-; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
-; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
-; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    paddd %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm0, %xmm5
+; SSSE3-NEXT:    movq %xmm2, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
+; SSSE3-NEXT:    movq %xmm5, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm6, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: saddo_v6i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq %rdi, %rax
-; SSE41-NEXT:    movd %esi, %xmm4
-; SSE41-NEXT:    pinsrd $1, %edx, %xmm4
-; SSE41-NEXT:    pinsrd $2, %ecx, %xmm4
-; SSE41-NEXT:    pinsrd $3, %r8d, %xmm4
-; SSE41-NEXT:    movd %r9d, %xmm2
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movd %esi, %xmm1
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
+; SSE41-NEXT:    movd %r9d, %xmm0
 ; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT:    paddd %xmm4, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSE41-NEXT:    pandn %xmm6, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    paddd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm5, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT:    pandn %xmm6, %xmm3
-; SSE41-NEXT:    movq %xmm0, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm1, (%rcx)
-; SSE41-NEXT:    movq %xmm3, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm1, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE41-NEXT:    paddd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movq %xmm2, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm4, (%rcx)
+; SSE41-NEXT:    movq %xmm0, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm6, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v6i32:
@@ -496,28 +349,15 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -525,17 +365,10 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
 ; AVX2-LABEL: saddo_v6i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -544,13 +377,10 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
 ; AVX512-LABEL: saddo_v6i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
@@ -568,37 +398,18 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
 define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
 ; SSE-LABEL: saddo_v8i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm4
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pxor %xmm6, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE-NEXT:    pxor %xmm5, %xmm6
-; SSE-NEXT:    pxor %xmm7, %xmm7
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE-NEXT:    pxor %xmm5, %xmm7
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE-NEXT:    paddd %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm5, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE-NEXT:    pandn %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE-NEXT:    pxor %xmm5, %xmm6
-; SSE-NEXT:    pxor %xmm7, %xmm7
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm7
-; SSE-NEXT:    pxor %xmm5, %xmm7
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE-NEXT:    paddd %xmm3, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE-NEXT:    pxor %xmm5, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm1
-; SSE-NEXT:    pandn %xmm6, %xmm1
-; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE-NEXT:    paddd %xmm0, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm1
+; SSE-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm2, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v8i32:
@@ -606,28 +417,15 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -635,30 +433,20 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
 ; AVX2-LABEL: saddo_v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: saddo_v8i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
@@ -674,132 +462,70 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
 define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
 ; SSE-LABEL: saddo_v16i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm3, %xmm8
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pxor %xmm11, %xmm11
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm11
-; SSE-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE-NEXT:    pxor %xmm10, %xmm11
-; SSE-NEXT:    pxor %xmm12, %xmm12
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm12
-; SSE-NEXT:    pxor %xmm10, %xmm12
-; SSE-NEXT:    pcmpeqd %xmm12, %xmm11
-; SSE-NEXT:    paddd %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm8, %xmm8
 ; SSE-NEXT:    pxor %xmm9, %xmm9
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE-NEXT:    pxor %xmm10, %xmm9
-; SSE-NEXT:    pcmpeqd %xmm12, %xmm9
-; SSE-NEXT:    pandn %xmm11, %xmm9
-; SSE-NEXT:    pxor %xmm12, %xmm12
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm12
-; SSE-NEXT:    pxor %xmm10, %xmm12
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE-NEXT:    pxor %xmm10, %xmm4
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm12
-; SSE-NEXT:    paddd %xmm5, %xmm1
-; SSE-NEXT:    pxor %xmm11, %xmm11
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm11
-; SSE-NEXT:    pxor %xmm10, %xmm11
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm11
-; SSE-NEXT:    pandn %xmm12, %xmm11
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE-NEXT:    pxor %xmm10, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE-NEXT:    pxor %xmm10, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE-NEXT:    paddd %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE-NEXT:    pxor %xmm10, %xmm6
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE-NEXT:    pandn %xmm4, %xmm6
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE-NEXT:    pxor %xmm10, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE-NEXT:    pxor %xmm10, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE-NEXT:    paddd %xmm7, %xmm8
-; SSE-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE-NEXT:    pxor %xmm10, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE-NEXT:    pandn %xmm4, %xmm3
-; SSE-NEXT:    movdqa %xmm8, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm9, %xmm0
-; SSE-NEXT:    movdqa %xmm11, %xmm1
-; SSE-NEXT:    movdqa %xmm6, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSE-NEXT:    paddd %xmm0, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm9, %xmm0
+; SSE-NEXT:    pxor %xmm9, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
+; SSE-NEXT:    paddd %xmm1, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm9, %xmm1
+; SSE-NEXT:    pxor %xmm9, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm6, %xmm9
+; SSE-NEXT:    paddd %xmm2, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm9, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm7, %xmm8
+; SSE-NEXT:    paddd %xmm3, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm8, %xmm3
+; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm4, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm5, %xmm4
-; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm4, %xmm8
-; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpandn %xmm8, %xmm4, %xmm8
+; AVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm4
-; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm4, %xmm7
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm10
-; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm1
-; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpandn %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm3
-; AVX1-NEXT:    vpxor %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm3, %xmm7
-; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm1
-; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpandn %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm3
-; AVX1-NEXT:    vpxor %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7
-; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm8, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT:    vmovdqa %xmm9, 48(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm10, 32(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm4, 16(%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX1-NEXT:    retq
 ;
@@ -807,28 +533,15 @@ define <16 x i32> @saddo_v16i32(<16 x i3
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpandn %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm4
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0
-; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0
-; AVX2-NEXT:    vpandn %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
@@ -842,13 +555,10 @@ define <16 x i32> @saddo_v16i32(<16 x i3
 ; AVX512-LABEL: saddo_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k0
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
@@ -1094,157 +804,42 @@ define <8 x i32> @saddo_v8i16(<8 x i16>
 }
 
 define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
-; SSE2-LABEL: saddo_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pandn %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: saddo_v2i64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    paddq %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm1, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pandn %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: saddo_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm5
-; SSE41-NEXT:    pxor %xmm1, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT:    pandn %xmm4, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: saddo_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm3
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT:    por %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -1252,17 +847,10 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
 ; AVX2-LABEL: saddo_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
@@ -1270,13 +858,10 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
 ; AVX512-LABEL: saddo_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k0
 ; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)

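As a cross-check of the updated CHECK lines above (sign-compare RHS against
zero, add or subtract, compare the result against LHS, then xor the two
masks), the overflow predicate these tests now exercise can be modelled in
scalar C roughly as follows. This is only an illustrative sketch; the helper
names saddo_i32/ssubo_i32 are not part of the patch.

    #include <stdbool.h>
    #include <stdint.h>

    /* Scalar model of the vectorized check: compute the result with
       wrap-around arithmetic, then xor a sign test on RHS with a
       "result < LHS" comparison to get the overflow flag. */
    static bool saddo_i32(int32_t lhs, int32_t rhs, int32_t *sum) {
      *sum = (int32_t)((uint32_t)lhs + (uint32_t)rhs); /* wrapping add */
      return (rhs < 0) != (*sum < lhs);                /* overflow     */
    }

    static bool ssubo_i32(int32_t lhs, int32_t rhs, int32_t *diff) {
      *diff = (int32_t)((uint32_t)lhs - (uint32_t)rhs); /* wrapping sub */
      return (rhs > 0) != (*diff < lhs);                /* overflow     */
    }

The subtraction variant tests RHS > 0 rather than RHS >= 0, since subtracting
zero can never overflow.
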
Modified: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ssubo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ssubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll Mon Sep 30 00:58:50 2019
@@ -51,71 +51,42 @@ define <1 x i32> @ssubo_v1i32(<1 x i32>
 define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
 ; SSE-LABEL: ssubo_v2i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm3, %xmm3
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE-NEXT:    pxor %xmm4, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE-NEXT:    psubd %xmm1, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    movq %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psubd %xmm1, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm3, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ssubo_v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ssubo_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovq %xmm1, (%rdi)
@@ -131,87 +102,49 @@ define <2 x i32> @ssubo_v2i32(<2 x i32>
 define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
 ; SSE2-LABEL: ssubo_v3i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    psubd %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movq %xmm0, (%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, 8(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movq %xmm3, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, 8(%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: ssubo_v3i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    psubd %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pandn %xmm3, %xmm2
-; SSSE3-NEXT:    movq %xmm0, (%rdi)
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    psubd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    movq %xmm3, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ssubo_v3i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    pandn %xmm3, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
-; SSE41-NEXT:    movq %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrd $2, %xmm3, 8(%rdi)
+; SSE41-NEXT:    movq %xmm3, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v3i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
 ; AVX1-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -219,18 +152,10 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
 ; AVX2-LABEL: ssubo_v3i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
 ; AVX2-NEXT:    vmovq %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
@@ -238,13 +163,10 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
 ; AVX512-LABEL: ssubo_v3i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
@@ -261,71 +183,42 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
 define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
 ; SSE-LABEL: ssubo_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm3, %xmm3
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE-NEXT:    pxor %xmm4, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE-NEXT:    psubd %xmm1, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psubd %xmm1, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm3, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v4i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ssubo_v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ssubo_v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -342,201 +235,132 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
 ; SSE2-LABEL: ssubo_v6i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movd %r8d, %xmm0
-; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd %edx, %xmm2
-; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    movd %esi, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
 ; SSE2-NEXT:    movd %r9d, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT:    psubd %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm6, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT:    pxor %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT:    psubd %xmm4, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm6
-; SSE2-NEXT:    movq %xmm1, 16(%rcx)
-; SSE2-NEXT:    movdqa %xmm0, (%rcx)
-; SSE2-NEXT:    movq %xmm6, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, (%rdi)
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psubd %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psubd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movq %xmm3, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: ssubo_v6i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
-; SSSE3-NEXT:    movd %r8d, %xmm0
-; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movd %edx, %xmm2
-; SSSE3-NEXT:    movd %esi, %xmm0
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd %r8d, %xmm1
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    movd %edx, %xmm1
+; SSSE3-NEXT:    movd %esi, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
 ; SSSE3-NEXT:    movd %r9d, %xmm1
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT:    psubd %xmm6, %xmm0
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm6, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT:    pxor %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT:    psubd %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm5, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm3
-; SSSE3-NEXT:    pxor %xmm5, %xmm3
-; SSSE3-NEXT:    pandn %xmm3, %xmm6
-; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
-; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
-; SSSE3-NEXT:    movq %xmm6, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    psubd %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    psubd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
+; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ssubo_v6i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq %rdi, %rax
-; SSE41-NEXT:    movd %esi, %xmm0
-; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
-; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
-; SSE41-NEXT:    movd %r9d, %xmm1
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd %esi, %xmm1
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
+; SSE41-NEXT:    movd %r9d, %xmm0
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
 ; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pxor %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE41-NEXT:    psubd %xmm6, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    pandn %xmm6, %xmm2
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    pxor %xmm7, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT:    psubd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    pandn %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psubd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psubd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    movq %xmm1, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm0, (%rcx)
-; SSE41-NEXT:    movq %xmm6, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rcx)
+; SSE41-NEXT:    movq %xmm0, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm3, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v6i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -544,18 +368,10 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
 ; AVX2-LABEL: ssubo_v6i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -564,13 +380,10 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
 ; AVX512-LABEL: ssubo_v6i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
@@ -588,70 +401,35 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
 define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
 ; SSE-LABEL: ssubo_v8i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm5, %xmm5
 ; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE-NEXT:    pxor %xmm6, %xmm4
-; SSE-NEXT:    pxor %xmm7, %xmm7
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE-NEXT:    pxor %xmm6, %xmm7
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm4
-; SSE-NEXT:    psubd %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pandn %xmm2, %xmm4
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm7, %xmm7
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm7
-; SSE-NEXT:    pxor %xmm6, %xmm7
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
-; SSE-NEXT:    psubd %xmm3, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE-NEXT:    pxor %xmm6, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm7, %xmm5
-; SSE-NEXT:    pxor %xmm6, %xmm5
-; SSE-NEXT:    pandn %xmm5, %xmm2
-; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    psubd %xmm2, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psubd %xmm3, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm5, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -659,31 +437,20 @@ define <8 x i32> @ssubo_v8i32(<8 x i32>
 ; AVX2-LABEL: ssubo_v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ssubo_v8i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
@@ -699,128 +466,59 @@ define <8 x i32> @ssubo_v8i32(<8 x i32>
 define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
 ; SSE-LABEL: ssubo_v16i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm10, %xmm10
-; SSE-NEXT:    pxor %xmm8, %xmm8
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm8
-; SSE-NEXT:    pcmpeqd %xmm11, %xmm11
-; SSE-NEXT:    pxor %xmm11, %xmm8
-; SSE-NEXT:    pxor %xmm9, %xmm9
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm9
-; SSE-NEXT:    pxor %xmm11, %xmm9
-; SSE-NEXT:    pcmpeqd %xmm9, %xmm8
-; SSE-NEXT:    psubd %xmm4, %xmm0
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE-NEXT:    pxor %xmm11, %xmm4
-; SSE-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE-NEXT:    pxor %xmm11, %xmm4
-; SSE-NEXT:    pandn %xmm4, %xmm8
 ; SSE-NEXT:    pxor %xmm9, %xmm9
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
-; SSE-NEXT:    pxor %xmm11, %xmm9
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE-NEXT:    pxor %xmm11, %xmm4
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSE-NEXT:    psubd %xmm5, %xmm1
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE-NEXT:    pxor %xmm11, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE-NEXT:    pxor %xmm11, %xmm5
-; SSE-NEXT:    pandn %xmm5, %xmm9
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE-NEXT:    pxor %xmm11, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE-NEXT:    pxor %xmm11, %xmm5
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE-NEXT:    psubd %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE-NEXT:    pxor %xmm11, %xmm6
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE-NEXT:    pxor %xmm11, %xmm6
-; SSE-NEXT:    pandn %xmm6, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSE-NEXT:    pxor %xmm11, %xmm5
-; SSE-NEXT:    pxor %xmm6, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE-NEXT:    pxor %xmm11, %xmm6
-; SSE-NEXT:    pcmpeqd %xmm6, %xmm5
-; SSE-NEXT:    psubd %xmm7, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm10
-; SSE-NEXT:    pxor %xmm11, %xmm10
-; SSE-NEXT:    pcmpeqd %xmm6, %xmm10
-; SSE-NEXT:    pxor %xmm11, %xmm10
-; SSE-NEXT:    pandn %xmm10, %xmm5
-; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movdqa %xmm8, %xmm0
-; SSE-NEXT:    movdqa %xmm9, %xmm1
-; SSE-NEXT:    movdqa %xmm4, %xmm2
-; SSE-NEXT:    movdqa %xmm5, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    psubd %xmm4, %xmm8
+; SSE-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psubd %xmm5, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm5
+; SSE-NEXT:    psubd %xmm6, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    movdqa %xmm3, %xmm6
+; SSE-NEXT:    psubd %xmm7, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm9, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm6, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm8, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v16i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm9, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm8
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm9, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm6, %xmm8
-; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm10
-; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm9, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpandn %xmm6, %xmm8, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm9, %xmm7
-; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm9, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm4, %xmm7
+; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm7
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm9, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm7, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm9, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm9, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm6
-; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm9, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm9, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm9, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm6, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsubd %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm9, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm8, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
@@ -830,40 +528,25 @@ define <16 x i32> @ssubo_v16i32(<16 x i3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT:    vmovdqa %xmm10, 48(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm7, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ssubo_v16i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1
-; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vpandn %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm4
 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0
-; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0
-; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vpandn %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
@@ -877,13 +560,10 @@ define <16 x i32> @ssubo_v16i32(<16 x i3
 ; AVX512-LABEL: ssubo_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
@@ -1129,161 +809,42 @@ define <8 x i32> @ssubo_v8i16(<8 x i16>
 }
 
 define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
-; SSE2-LABEL: ssubo_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    pandn %xmm0, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: ssubo_v2i64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubq %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm1, %xmm5
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: ssubo_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psubq %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm5
-; SSE41-NEXT:    pxor %xmm1, %xmm5
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    pandn %xmm2, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: ssubo_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm3
+; SSE-NEXT:    psubq %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT:    por %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -1291,18 +852,10 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
 ; AVX2-LABEL: ssubo_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
@@ -1310,13 +863,10 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
 ; AVX512-LABEL: ssubo_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
 ; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k2
-; AVX512-NEXT:    kxorw %k2, %k1, %k1
-; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
