[llvm] d10f23a - [ISel] Expand saddsat and ssubsat via asr and xor

David Green via llvm-commits <llvm-commits at lists.llvm.org>
Thu Aug 19 08:08:19 PDT 2021


Author: David Green
Date: 2021-08-19T16:08:07+01:00
New Revision: d10f23a25d5c7b6dd61cbab8e16bf2099151be42

URL: https://github.com/llvm/llvm-project/commit/d10f23a25d5c7b6dd61cbab8e16bf2099151be42
DIFF: https://github.com/llvm/llvm-project/commit/d10f23a25d5c7b6dd61cbab8e16bf2099151be42.diff

LOG: [ISel] Expand saddsat and ssubsat via asr and xor

This changes the lowering of saddsat and ssubsat so that, instead of
expanding to:
  r,o = saddo x, y
  c = setcc r < 0
  s = c ? INTMAX : INTMIN
  ret o ? s : r
they now use asr and xor to materialize the INTMAX/INTMIN constants:
  r,o = saddo x, y
  s = ashr r, BW-1
  x = xor s, INTMIN
  ret o ? x : r
https://alive2.llvm.org/ce/z/TYufgD
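
For illustration, here is a minimal standalone sketch (plain C++ using the
GCC/Clang __builtin_add_overflow builtin; not code from this patch, and the
function names are made up for the example) of the two expansions for a
32-bit signed saturating add:

  #include <cstdint>
  #include <limits>

  // Old expansion: compare the wrapped sum against zero and select the
  // saturation constant.
  int32_t sadd_sat_select(int32_t x, int32_t y) {
    int32_t Sum;
    bool Overflow = __builtin_add_overflow(x, y, &Sum);
    int32_t Sat = Sum < 0 ? std::numeric_limits<int32_t>::max()
                          : std::numeric_limits<int32_t>::min();
    return Overflow ? Sat : Sum;
  }

  // New expansion: Sum >> 31 is all-ones when the wrapped sum is negative
  // and zero otherwise, so xoring it with INTMIN yields INTMAX or INTMIN
  // without a compare/select. (Assumes arithmetic right shift of signed
  // values, which is guaranteed from C++20 and holds on these targets.)
  int32_t sadd_sat_asr_xor(int32_t x, int32_t y) {
    int32_t Sum;
    bool Overflow = __builtin_add_overflow(x, y, &Sum);
    int32_t Sat = (Sum >> 31) ^ std::numeric_limits<int32_t>::min();
    return Overflow ? Sat : Sum;
  }

On AArch64, for example, this is what replaces the mov/cmp/cinv sequence
with the asr/eor pair visible in the updated tests below.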

This seems to reduce the instruction count in most test cases across most
architectures. X86 has some custom lowering added to compensate for the
cases where the new expansion would otherwise increase the instruction count.

Differential Revision: https://reviews.llvm.org/D105853

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sadd_sat.ll
    llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
    llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/ssub_sat.ll
    llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
    llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
    llvm/test/CodeGen/AMDGPU/saddsat.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/ARM/qdadd.ll
    llvm/test/CodeGen/ARM/sadd_sat.ll
    llvm/test/CodeGen/ARM/sadd_sat_plus.ll
    llvm/test/CodeGen/ARM/ssub_sat.ll
    llvm/test/CodeGen/ARM/ssub_sat_plus.ll
    llvm/test/CodeGen/PowerPC/sat-add.ll
    llvm/test/CodeGen/RISCV/sadd_sat.ll
    llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
    llvm/test/CodeGen/RISCV/ssub_sat.ll
    llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
    llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
    llvm/test/CodeGen/X86/combine-add-ssat.ll
    llvm/test/CodeGen/X86/sadd_sat.ll
    llvm/test/CodeGen/X86/sadd_sat_plus.ll
    llvm/test/CodeGen/X86/sadd_sat_vec.ll
    llvm/test/CodeGen/X86/ssub_sat.ll
    llvm/test/CodeGen/X86/ssub_sat_plus.ll
    llvm/test/CodeGen/X86/ssub_sat_vec.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ceca4aaf4188..8dfdb00f64bf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -816,7 +816,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
 
   // Shift cannot use a min/max expansion, we can't detect overflow if all of
   // the bits have been shifted out.
-  if (IsShift || TLI.isOperationLegalOrCustom(Opcode, PromotedType)) {
+  if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) {
     unsigned ShiftOp;
     switch (Opcode) {
     case ISD::SADDSAT:

diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cdb3aedf4d23..accda2588c88 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8103,14 +8103,12 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
     return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff);
   }
 
-  // SatMax -> Overflow && SumDiff < 0
-  // SatMin -> Overflow && SumDiff >= 0
+  // Overflow ? (SumDiff >> BW) ^ MinVal : SumDiff
   APInt MinVal = APInt::getSignedMinValue(BitWidth);
-  APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
   SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
-  SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
-  SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
-  Result = DAG.getSelect(dl, VT, SumNeg, SatMax, SatMin);
+  SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, SumDiff,
+                              DAG.getConstant(BitWidth - 1, dl, VT));
+  Result = DAG.getNode(ISD::XOR, dl, VT, Shift, SatMin);
   return DAG.getSelect(dl, VT, Overflow, Result, SumDiff);
 }
 
@@ -8421,7 +8419,7 @@ void TargetLowering::expandSADDSUBO(
 
   // If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
   unsigned OpcSat = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT;
-  if (isOperationLegalOrCustom(OpcSat, LHS.getValueType())) {
+  if (isOperationLegal(OpcSat, LHS.getValueType())) {
     SDValue Sat = DAG.getNode(OpcSat, dl, LHS.getValueType(), LHS, RHS);
     SDValue SetCC = DAG.getSetCC(dl, OType, Result, Sat, ISD::SETNE);
     Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 172dc35b5923..3595bc57e4d8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -207,6 +207,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
   }
 
+  // Signed saturation subtraction.
+  setOperationAction(ISD::SSUBSAT          , MVT::i8   , Custom);
+  setOperationAction(ISD::SSUBSAT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SSUBSAT          , MVT::i32  , Custom);
+  if (Subtarget.is64Bit())
+    setOperationAction(ISD::SSUBSAT        , MVT::i64  , Custom);
+
   // Funnel shifts.
   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
     // For slow shld targets we only lower for code size.
@@ -1142,6 +1149,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
 
     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
 
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
@@ -27958,6 +27967,25 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
     return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
   }
 
+  if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
+      (!VT.isVector() || VT == MVT::v2i64)) {
+    unsigned BitWidth = VT.getScalarSizeInBits();
+    APInt MinVal = APInt::getSignedMinValue(BitWidth);
+    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    SDValue Result =
+        DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
+                    DAG.getVTList(VT, SetCCResultType), X, Y);
+    SDValue SumDiff = Result.getValue(0);
+    SDValue Overflow = Result.getValue(1);
+    SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
+    SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
+    SDValue SumNeg =
+        DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
+    Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
+    return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
+  }
+
   // Use default expansion.
   return SDValue();
 }

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index 89ff9fad542b..9ed83dc7850e 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -12,11 +12,9 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adds w8, w0, w1
-; CHECK-NEXT:    mov w9, #2147483647
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cinv w8, w9, ge
-; CHECK-NEXT:    adds w9, w0, w1
-; CHECK-NEXT:    csel w0, w8, w9, vs
+; CHECK-NEXT:    asr w9, w8, #31
+; CHECK-NEXT:    eor w9, w9, #0x80000000
+; CHECK-NEXT:    csel w0, w9, w8, vs
 ; CHECK-NEXT:    ret
   %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y);
   ret i32 %tmp;
@@ -26,11 +24,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: func2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adds x8, x0, x1
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    cinv x8, x9, ge
-; CHECK-NEXT:    adds x9, x0, x1
-; CHECK-NEXT:    csel x0, x8, x9, vs
+; CHECK-NEXT:    asr x9, x8, #63
+; CHECK-NEXT:    eor x9, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x9, x8, vs
 ; CHECK-NEXT:    ret
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y);
   ret i64 %tmp;

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
index 1b631448181a..8ea35fdf9596 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
@@ -11,11 +11,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-LABEL: func32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mul w8, w1, w2
-; CHECK-NEXT:    adds w10, w0, w8
-; CHECK-NEXT:    mov w9, #2147483647
-; CHECK-NEXT:    cmp w10, #0
-; CHECK-NEXT:    cinv w9, w9, ge
 ; CHECK-NEXT:    adds w8, w0, w8
+; CHECK-NEXT:    asr w9, w8, #31
+; CHECK-NEXT:    eor w9, w9, #0x80000000
 ; CHECK-NEXT:    csel w0, w9, w8, vs
 ; CHECK-NEXT:    ret
   %a = mul i32 %y, %z
@@ -27,11 +25,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-LABEL: func64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adds x8, x0, x2
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    cinv x8, x9, ge
-; CHECK-NEXT:    adds x9, x0, x2
-; CHECK-NEXT:    csel x0, x8, x9, vs
+; CHECK-NEXT:    asr x9, x8, #63
+; CHECK-NEXT:    eor x9, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x9, x8, vs
 ; CHECK-NEXT:    ret
   %a = mul i64 %y, %z
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z)

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 53e3563e7cea..cda08e950d78 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -351,26 +351,23 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adds x8, x2, x6
-; CHECK-NEXT:    adcs x12, x3, x7
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    eor x10, x3, x7
-; CHECK-NEXT:    cmp x12, #0
-; CHECK-NEXT:    eor x13, x3, x12
-; CHECK-NEXT:    cinv x14, x9, ge
-; CHECK-NEXT:    bics xzr, x13, x10
-; CHECK-NEXT:    asr x10, x12, #63
-; CHECK-NEXT:    csel x2, x10, x8, lt
-; CHECK-NEXT:    csel x3, x14, x12, lt
+; CHECK-NEXT:    adcs x11, x3, x7
+; CHECK-NEXT:    eor x9, x3, x7
+; CHECK-NEXT:    eor x12, x3, x11
+; CHECK-NEXT:    bics xzr, x12, x9
+; CHECK-NEXT:    asr x9, x11, #63
+; CHECK-NEXT:    eor x12, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x2, x9, x8, lt
+; CHECK-NEXT:    csel x3, x12, x11, lt
 ; CHECK-NEXT:    adds x8, x0, x4
-; CHECK-NEXT:    adcs x10, x1, x5
-; CHECK-NEXT:    eor x11, x1, x5
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    eor x12, x1, x10
-; CHECK-NEXT:    cinv x9, x9, ge
-; CHECK-NEXT:    bics xzr, x12, x11
-; CHECK-NEXT:    asr x11, x10, #63
+; CHECK-NEXT:    adcs x9, x1, x5
+; CHECK-NEXT:    eor x10, x1, x5
+; CHECK-NEXT:    eor x12, x1, x9
+; CHECK-NEXT:    asr x11, x9, #63
+; CHECK-NEXT:    bics xzr, x12, x10
+; CHECK-NEXT:    eor x13, x11, #0x8000000000000000
 ; CHECK-NEXT:    csel x8, x11, x8, lt
-; CHECK-NEXT:    csel x1, x9, x10, lt
+; CHECK-NEXT:    csel x1, x13, x9, lt
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index 2dd854d2898b..e5b25832db7a 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -12,11 +12,9 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    subs w8, w0, w1
-; CHECK-NEXT:    mov w9, #2147483647
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cinv w8, w9, ge
-; CHECK-NEXT:    subs w9, w0, w1
-; CHECK-NEXT:    csel w0, w8, w9, vs
+; CHECK-NEXT:    asr w9, w8, #31
+; CHECK-NEXT:    eor w9, w9, #0x80000000
+; CHECK-NEXT:    csel w0, w9, w8, vs
 ; CHECK-NEXT:    ret
   %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y);
   ret i32 %tmp;
@@ -26,11 +24,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: func2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    subs x8, x0, x1
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    cinv x8, x9, ge
-; CHECK-NEXT:    subs x9, x0, x1
-; CHECK-NEXT:    csel x0, x8, x9, vs
+; CHECK-NEXT:    asr x9, x8, #63
+; CHECK-NEXT:    eor x9, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x9, x8, vs
 ; CHECK-NEXT:    ret
   %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y);
   ret i64 %tmp;

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
index 8a753b071c76..9c108a582c23 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
@@ -11,11 +11,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-LABEL: func32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mul w8, w1, w2
-; CHECK-NEXT:    subs w10, w0, w8
-; CHECK-NEXT:    mov w9, #2147483647
-; CHECK-NEXT:    cmp w10, #0
-; CHECK-NEXT:    cinv w9, w9, ge
 ; CHECK-NEXT:    subs w8, w0, w8
+; CHECK-NEXT:    asr w9, w8, #31
+; CHECK-NEXT:    eor w9, w9, #0x80000000
 ; CHECK-NEXT:    csel w0, w9, w8, vs
 ; CHECK-NEXT:    ret
   %a = mul i32 %y, %z
@@ -27,11 +25,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-LABEL: func64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    subs x8, x0, x2
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    cinv x8, x9, ge
-; CHECK-NEXT:    subs x9, x0, x2
-; CHECK-NEXT:    csel x0, x8, x9, vs
+; CHECK-NEXT:    asr x9, x8, #63
+; CHECK-NEXT:    eor x9, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x0, x9, x8, vs
 ; CHECK-NEXT:    ret
   %a = mul i64 %y, %z
   %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %z)

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 8babee052fed..77e8237faf1e 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -354,26 +354,23 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    subs x8, x2, x6
-; CHECK-NEXT:    sbcs x12, x3, x7
-; CHECK-NEXT:    mov x9, #9223372036854775807
-; CHECK-NEXT:    eor x10, x3, x7
-; CHECK-NEXT:    cmp x12, #0
-; CHECK-NEXT:    eor x13, x3, x12
-; CHECK-NEXT:    cinv x14, x9, ge
-; CHECK-NEXT:    tst x10, x13
-; CHECK-NEXT:    asr x10, x12, #63
-; CHECK-NEXT:    csel x2, x10, x8, lt
-; CHECK-NEXT:    csel x3, x14, x12, lt
+; CHECK-NEXT:    sbcs x11, x3, x7
+; CHECK-NEXT:    eor x9, x3, x7
+; CHECK-NEXT:    eor x12, x3, x11
+; CHECK-NEXT:    tst x9, x12
+; CHECK-NEXT:    asr x9, x11, #63
+; CHECK-NEXT:    eor x12, x9, #0x8000000000000000
+; CHECK-NEXT:    csel x2, x9, x8, lt
+; CHECK-NEXT:    csel x3, x12, x11, lt
 ; CHECK-NEXT:    subs x8, x0, x4
-; CHECK-NEXT:    sbcs x10, x1, x5
-; CHECK-NEXT:    eor x11, x1, x5
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    eor x12, x1, x10
-; CHECK-NEXT:    cinv x9, x9, ge
-; CHECK-NEXT:    tst x11, x12
-; CHECK-NEXT:    asr x11, x10, #63
+; CHECK-NEXT:    sbcs x9, x1, x5
+; CHECK-NEXT:    eor x10, x1, x5
+; CHECK-NEXT:    eor x12, x1, x9
+; CHECK-NEXT:    asr x11, x9, #63
+; CHECK-NEXT:    tst x10, x12
+; CHECK-NEXT:    eor x13, x11, #0x8000000000000000
 ; CHECK-NEXT:    csel x8, x11, x8, lt
-; CHECK-NEXT:    csel x1, x9, x10, lt
+; CHECK-NEXT:    csel x1, x13, x9, lt
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0

diff  --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 3c2b66c302c1..827d23dbd463 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -62,10 +62,8 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -93,10 +91,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v0, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -107,10 +103,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -159,19 +153,18 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -228,26 +221,25 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -313,19 +305,18 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -333,17 +324,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -376,17 +367,16 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v5, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
+; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -397,17 +387,16 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v5, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -438,13 +427,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -455,13 +441,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -472,13 +455,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -487,15 +467,13 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX10-NEXT:    v_bfrev_b32_e32 v6, -2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[4:5]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v6, s5
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index fde23b00aec5..3e8a725ca6c8 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -62,10 +62,8 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -93,10 +91,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v0, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -107,10 +103,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -159,19 +153,18 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -229,26 +222,25 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
@@ -314,19 +306,18 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
+; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -334,17 +325,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v5, v4, v2
-; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -377,17 +368,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v5, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -398,17 +388,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v5, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -438,24 +427,23 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v7, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v4
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, v7, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v2, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
+; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -466,24 +454,23 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v6, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v7, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v1, v4
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v6, v7, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v2, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -515,31 +502,30 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v8, v9, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v8, v9, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v8, v9, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
+; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v7
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -550,31 +536,30 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v9, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v9, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v1, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v9, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v2, v6
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v9, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v7
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -608,59 +593,59 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v8
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v16, v17, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v1, v9
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v10
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
+; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v3, v16, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v4, v16, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v5, v16, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v6, v16, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v16, v17, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
+; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v7, v16, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -671,59 +656,59 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v0, v8
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v17, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v17, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v1, v9
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v2, v10
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v3, v16, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v4, v16, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v5, v16, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v6, v16, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v16, v17, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v7, v16, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -765,115 +750,115 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v0, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX6-NEXT:    v_bfrev_b32_e32 v32, 1
-; GFX6-NEXT:    v_bfrev_b32_e32 v33, -2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v32, v33, s[6:7]
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v1, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v32, v33, s[6:7]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v2, v18
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
+; GFX6-NEXT:    v_bfrev_b32_e32 v17, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v3, v17, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v4, v17, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v5, v21
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v5, v17, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v6, v22
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v6, v17, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v7, v23
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
+; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v7, v17, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v8, v24
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v8, v17, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v9, v25
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v9, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
+; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v9, v17, v9
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v10, v26
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v10, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
+; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v10, v17, v10
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v11, v27
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v11, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
+; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v11, v17, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v12, v28
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v12, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
+; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v12, v17, v12
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v13, v29
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v13, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
+; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v13, v17, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v14, v30
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v14, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
+; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v14, v17, v14
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v31
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX6-NEXT:    v_cndmask_b32_e64 v15, v32, v33, s[6:7]
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
+; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
+; GFX6-NEXT:    v_xor_b32_e32 v15, v17, v15
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -884,115 +869,115 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v0, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX8-NEXT:    v_bfrev_b32_e32 v32, 1
-; GFX8-NEXT:    v_bfrev_b32_e32 v33, -2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v32, v33, s[6:7]
+; GFX8-NEXT:    s_brev_b32 s6, 1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v1, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v32, v33, s[6:7]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v2, v18
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_bfrev_b32_e32 v17, 1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v3, v17, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v4, v17, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v5, v21
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v5, v17, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v6, v22
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v6, v17, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v7, v23
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v7, v17, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v8, v24
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v8, v17, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v9, v25
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v9, v17, v9
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v10, v26
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v10, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v10, v17, v10
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v11, v27
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v11, v17, v11
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v12, v28
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v12, v17, v12
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v13, v29
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v13, v17, v13
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v14, v30
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v14, v17, v14
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v31
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v15, v32, v33, s[6:7]
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
+; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
+; GFX8-NEXT:    v_xor_b32_e32 v15, v17, v15
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1052,13 +1037,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1069,13 +1051,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1086,13 +1065,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1101,15 +1077,13 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX10-NEXT:    v_bfrev_b32_e32 v6, -2
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[4:5]
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v6, s5
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)

diff  --git a/llvm/test/CodeGen/ARM/qdadd.ll b/llvm/test/CodeGen/ARM/qdadd.ll
index 94d165c2502a..be35c75cc5de 100644
--- a/llvm/test/CodeGen/ARM/qdadd.ll
+++ b/llvm/test/CodeGen/ARM/qdadd.ll
@@ -7,32 +7,14 @@
 define i32 @qdadd(i32 %x, i32 %y) nounwind {
 ; CHECK-T2NODSP-LABEL: qdadd:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r7, lr}
-; CHECK-T2NODSP-NEXT:    movs r3, #0
-; CHECK-T2NODSP-NEXT:    adds.w r12, r0, r0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    mov.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r12, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r3, r12
-; CHECK-T2NODSP-NEXT:    adds r0, r3, r1
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r0
 ; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi.w lr, #1
-; CHECK-T2NODSP-NEXT:    cmp.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r0, r3
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r2, r0
-; CHECK-T2NODSP-NEXT:    mov r0, r2
-; CHECK-T2NODSP-NEXT:    pop {r7, pc}
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r1
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: qdadd:
 ; CHECK-T2DSP:       @ %bb.0:
@@ -51,32 +33,14 @@ define i32 @qdadd(i32 %x, i32 %y) nounwind {
 define i32 @qdadd_c(i32 %x, i32 %y) nounwind {
 ; CHECK-T2NODSP-LABEL: qdadd_c:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r7, lr}
-; CHECK-T2NODSP-NEXT:    movs r3, #0
-; CHECK-T2NODSP-NEXT:    adds.w r12, r0, r0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    mov.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r12, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r3, r12
-; CHECK-T2NODSP-NEXT:    adds r0, r1, r3
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r0
 ; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi.w lr, #1
-; CHECK-T2NODSP-NEXT:    cmp.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r2, r0
-; CHECK-T2NODSP-NEXT:    mov r0, r2
-; CHECK-T2NODSP-NEXT:    pop {r7, pc}
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r1
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: qdadd_c:
 ; CHECK-T2DSP:       @ %bb.0:
@@ -95,32 +59,14 @@ define i32 @qdadd_c(i32 %x, i32 %y) nounwind {
 define i32 @qdsub(i32 %x, i32 %y) nounwind {
 ; CHECK-T2NODSP-LABEL: qdsub:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r7, lr}
-; CHECK-T2NODSP-NEXT:    movs r3, #0
-; CHECK-T2NODSP-NEXT:    adds.w r12, r0, r0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    mov.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r12, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r3, r12
-; CHECK-T2NODSP-NEXT:    subs r0, r1, r3
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r0
 ; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi.w lr, #1
-; CHECK-T2NODSP-NEXT:    cmp.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r1, r3
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r2, r0
-; CHECK-T2NODSP-NEXT:    mov r0, r2
-; CHECK-T2NODSP-NEXT:    pop {r7, pc}
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    subs r0, r1, r0
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: qdsub:
 ; CHECK-T2DSP:       @ %bb.0:
@@ -139,32 +85,14 @@ define i32 @qdsub(i32 %x, i32 %y) nounwind {
 define i32 @qdsub_c(i32 %x, i32 %y) nounwind {
 ; CHECK-T2NODSP-LABEL: qdsub_c:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r7, lr}
-; CHECK-T2NODSP-NEXT:    movs r3, #0
-; CHECK-T2NODSP-NEXT:    adds.w r12, r0, r0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    mov.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r3, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r12, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r3, r12
-; CHECK-T2NODSP-NEXT:    subs r0, r3, r1
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r0
 ; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi.w lr, #1
-; CHECK-T2NODSP-NEXT:    cmp.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r3, r1
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r2, r0
-; CHECK-T2NODSP-NEXT:    mov r0, r2
-; CHECK-T2NODSP-NEXT:    pop {r7, pc}
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r2, r0, asr #31
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: qdsub_c:
 ; CHECK-T2DSP:       @ %bb.0:

diff  --git a/llvm/test/CodeGen/ARM/sadd_sat.ll b/llvm/test/CodeGen/ARM/sadd_sat.ll
index 6d5e3e3d216c..287e52d5044d 100644
--- a/llvm/test/CodeGen/ARM/sadd_sat.ll
+++ b/llvm/test/CodeGen/ARM/sadd_sat.ll
@@ -16,48 +16,22 @@ declare i64 @llvm.sadd.sat.i64(i64, i64)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; CHECK-T1-LABEL: func:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    mov r2, r0
-; CHECK-T1-NEXT:    movs r3, #1
 ; CHECK-T1-NEXT:    adds r0, r0, r1
-; CHECK-T1-NEXT:    mov r1, r3
-; CHECK-T1-NEXT:    bmi .LBB0_2
+; CHECK-T1-NEXT:    bvc .LBB0_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    movs r1, #0
+; CHECK-T1-NEXT:    asrs r1, r0, #31
+; CHECK-T1-NEXT:    movs r0, #1
+; CHECK-T1-NEXT:    lsls r0, r0, #31
+; CHECK-T1-NEXT:    eors r0, r1
 ; CHECK-T1-NEXT:  .LBB0_2:
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bne .LBB0_4
-; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    lsls r1, r3, #31
-; CHECK-T1-NEXT:    cmp r0, r2
-; CHECK-T1-NEXT:    bvs .LBB0_5
-; CHECK-T1-NEXT:    b .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_4:
-; CHECK-T1-NEXT:    ldr r1, .LCPI0_0
-; CHECK-T1-NEXT:    cmp r0, r2
-; CHECK-T1-NEXT:    bvc .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_5:
-; CHECK-T1-NEXT:    mov r0, r1
-; CHECK-T1-NEXT:  .LBB0_6:
 ; CHECK-T1-NEXT:    bx lr
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI0_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
 ;
 ; CHECK-T2NODSP-LABEL: func:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    adds r2, r0, r1
-; CHECK-T2NODSP-NEXT:    mov.w r3, #0
+; CHECK-T2NODSP-NEXT:    adds r0, r0, r1
 ; CHECK-T2NODSP-NEXT:    mov.w r1, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r1, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r2, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r1, r2
-; CHECK-T2NODSP-NEXT:    mov r0, r1
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r1, r0, asr #31
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func:
@@ -67,15 +41,9 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ;
 ; CHECK-ARMNODPS-LABEL: func:
 ; CHECK-ARMNODPS:       @ %bb.0:
-; CHECK-ARMNODPS-NEXT:    adds r2, r0, r1
-; CHECK-ARMNODPS-NEXT:    mov r3, #0
-; CHECK-ARMNODPS-NEXT:    movmi r3, #1
+; CHECK-ARMNODPS-NEXT:    adds r0, r0, r1
 ; CHECK-ARMNODPS-NEXT:    mov r1, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r3, #0
-; CHECK-ARMNODPS-NEXT:    mvnne r1, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r2, r0
-; CHECK-ARMNODPS-NEXT:    movvc r1, r2
-; CHECK-ARMNODPS-NEXT:    mov r0, r1
+; CHECK-ARMNODPS-NEXT:    eorvs r0, r1, r0, asr #31
 ; CHECK-ARMNODPS-NEXT:    bx lr
 ;
 ; CHECK-ARMBASEDSP-LABEL: func:
@@ -97,36 +65,28 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-T1-NEXT:    .save {r4, lr}
 ; CHECK-T1-NEXT:    push {r4, lr}
 ; CHECK-T1-NEXT:    mov r4, r1
-; CHECK-T1-NEXT:    eors r4, r3
-; CHECK-T1-NEXT:    adds r0, r0, r2
-; CHECK-T1-NEXT:    adcs r3, r1
 ; CHECK-T1-NEXT:    eors r1, r3
-; CHECK-T1-NEXT:    bics r1, r4
-; CHECK-T1-NEXT:    bpl .LBB1_2
+; CHECK-T1-NEXT:    adds r2, r0, r2
+; CHECK-T1-NEXT:    adcs r3, r4
+; CHECK-T1-NEXT:    eors r4, r3
+; CHECK-T1-NEXT:    bics r4, r1
+; CHECK-T1-NEXT:    asrs r1, r3, #31
+; CHECK-T1-NEXT:    cmp r4, #0
+; CHECK-T1-NEXT:    mov r0, r1
+; CHECK-T1-NEXT:    bmi .LBB1_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    asrs r0, r3, #31
+; CHECK-T1-NEXT:    mov r0, r2
 ; CHECK-T1-NEXT:  .LBB1_2:
-; CHECK-T1-NEXT:    cmp r3, #0
+; CHECK-T1-NEXT:    cmp r4, #0
 ; CHECK-T1-NEXT:    bmi .LBB1_4
 ; CHECK-T1-NEXT:  @ %bb.3:
+; CHECK-T1-NEXT:    mov r1, r3
+; CHECK-T1-NEXT:    pop {r4, pc}
+; CHECK-T1-NEXT:  .LBB1_4:
 ; CHECK-T1-NEXT:    movs r2, #1
 ; CHECK-T1-NEXT:    lsls r2, r2, #31
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bpl .LBB1_5
-; CHECK-T1-NEXT:    b .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_4:
-; CHECK-T1-NEXT:    ldr r2, .LCPI1_0
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bmi .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_5:
-; CHECK-T1-NEXT:    mov r2, r3
-; CHECK-T1-NEXT:  .LBB1_6:
-; CHECK-T1-NEXT:    mov r1, r2
+; CHECK-T1-NEXT:    eors r1, r2
 ; CHECK-T1-NEXT:    pop {r4, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI1_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
 ;
 ; CHECK-T2-LABEL: func2:
 ; CHECK-T2:       @ %bb.0:
@@ -134,17 +94,14 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-T2-NEXT:    eor.w r12, r1, r3
 ; CHECK-T2-NEXT:    adc.w r2, r1, r3
 ; CHECK-T2-NEXT:    eors r1, r2
-; CHECK-T2-NEXT:    bic.w r3, r1, r12
+; CHECK-T2-NEXT:    bic.w r1, r1, r12
+; CHECK-T2-NEXT:    cmp r1, #0
 ; CHECK-T2-NEXT:    mov.w r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
 ; CHECK-T2-NEXT:    it mi
 ; CHECK-T2-NEXT:    asrmi r0, r2, #31
-; CHECK-T2-NEXT:    cmp r2, #0
 ; CHECK-T2-NEXT:    it mi
-; CHECK-T2-NEXT:    mvnmi r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
-; CHECK-T2-NEXT:    it pl
-; CHECK-T2-NEXT:    movpl r1, r2
+; CHECK-T2-NEXT:    eormi.w r2, r1, r2, asr #31
+; CHECK-T2-NEXT:    mov r1, r2
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func2:
@@ -153,14 +110,12 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-ARM-NEXT:    eor r12, r1, r3
 ; CHECK-ARM-NEXT:    adc r2, r1, r3
 ; CHECK-ARM-NEXT:    eor r1, r1, r2
-; CHECK-ARM-NEXT:    bic r3, r1, r12
+; CHECK-ARM-NEXT:    bic r1, r1, r12
+; CHECK-ARM-NEXT:    cmp r1, #0
 ; CHECK-ARM-NEXT:    mov r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
 ; CHECK-ARM-NEXT:    asrmi r0, r2, #31
-; CHECK-ARM-NEXT:    cmp r2, #0
-; CHECK-ARM-NEXT:    mvnmi r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
-; CHECK-ARM-NEXT:    movpl r1, r2
+; CHECK-ARM-NEXT:    eormi r2, r1, r2, asr #31
+; CHECK-ARM-NEXT:    mov r1, r2
 ; CHECK-ARM-NEXT:    bx lr
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y)
   ret i64 %tmp

diff  --git a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
index f0a0b72db2fb..6a8d9def5663 100644
--- a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
@@ -13,49 +13,24 @@ declare i64 @llvm.sadd.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-T1-LABEL: func32:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    mov r3, r0
 ; CHECK-T1-NEXT:    muls r1, r2, r1
-; CHECK-T1-NEXT:    movs r2, #1
 ; CHECK-T1-NEXT:    adds r0, r0, r1
-; CHECK-T1-NEXT:    mov r1, r2
-; CHECK-T1-NEXT:    bmi .LBB0_2
+; CHECK-T1-NEXT:    bvc .LBB0_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    movs r1, #0
+; CHECK-T1-NEXT:    asrs r1, r0, #31
+; CHECK-T1-NEXT:    movs r0, #1
+; CHECK-T1-NEXT:    lsls r0, r0, #31
+; CHECK-T1-NEXT:    eors r0, r1
 ; CHECK-T1-NEXT:  .LBB0_2:
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bne .LBB0_4
-; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    lsls r1, r2, #31
-; CHECK-T1-NEXT:    cmp r0, r3
-; CHECK-T1-NEXT:    bvs .LBB0_5
-; CHECK-T1-NEXT:    b .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_4:
-; CHECK-T1-NEXT:    ldr r1, .LCPI0_0
-; CHECK-T1-NEXT:    cmp r0, r3
-; CHECK-T1-NEXT:    bvc .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_5:
-; CHECK-T1-NEXT:    mov r0, r1
-; CHECK-T1-NEXT:  .LBB0_6:
 ; CHECK-T1-NEXT:    bx lr
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI0_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
 ;
 ; CHECK-T2NODSP-LABEL: func32:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    mla r2, r1, r2, r0
-; CHECK-T2NODSP-NEXT:    movs r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r1, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r2, #0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r1, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r2, r0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r1, r2
+; CHECK-T2NODSP-NEXT:    mla r1, r1, r2, r0
+; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
+; CHECK-T2NODSP-NEXT:    cmp r1, r0
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r1, r2, r1, asr #31
 ; CHECK-T2NODSP-NEXT:    mov r0, r1
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
@@ -84,35 +59,28 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-T1-NEXT:    mov r2, r1
 ; CHECK-T1-NEXT:    eors r2, r3
 ; CHECK-T1-NEXT:    ldr r4, [sp, #8]
-; CHECK-T1-NEXT:    adds r0, r0, r4
+; CHECK-T1-NEXT:    adds r4, r0, r4
 ; CHECK-T1-NEXT:    adcs r3, r1
 ; CHECK-T1-NEXT:    eors r1, r3
 ; CHECK-T1-NEXT:    bics r1, r2
-; CHECK-T1-NEXT:    bpl .LBB1_2
+; CHECK-T1-NEXT:    asrs r2, r3, #31
+; CHECK-T1-NEXT:    cmp r1, #0
+; CHECK-T1-NEXT:    mov r0, r2
+; CHECK-T1-NEXT:    bmi .LBB1_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    asrs r0, r3, #31
+; CHECK-T1-NEXT:    mov r0, r4
 ; CHECK-T1-NEXT:  .LBB1_2:
-; CHECK-T1-NEXT:    cmp r3, #0
+; CHECK-T1-NEXT:    cmp r1, #0
 ; CHECK-T1-NEXT:    bmi .LBB1_4
 ; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    movs r2, #1
-; CHECK-T1-NEXT:    lsls r2, r2, #31
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bpl .LBB1_5
-; CHECK-T1-NEXT:    b .LBB1_6
+; CHECK-T1-NEXT:    mov r1, r3
+; CHECK-T1-NEXT:    pop {r4, pc}
 ; CHECK-T1-NEXT:  .LBB1_4:
-; CHECK-T1-NEXT:    ldr r2, .LCPI1_0
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bmi .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_5:
-; CHECK-T1-NEXT:    mov r2, r3
-; CHECK-T1-NEXT:  .LBB1_6:
+; CHECK-T1-NEXT:    movs r1, #1
+; CHECK-T1-NEXT:    lsls r1, r1, #31
+; CHECK-T1-NEXT:    eors r2, r1
 ; CHECK-T1-NEXT:    mov r1, r2
 ; CHECK-T1-NEXT:    pop {r4, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI1_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
 ;
 ; CHECK-T2-LABEL: func64:
 ; CHECK-T2:       @ %bb.0:
@@ -122,17 +90,14 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-T2-NEXT:    adc.w r2, r1, r12
 ; CHECK-T2-NEXT:    eor.w r3, r1, r12
 ; CHECK-T2-NEXT:    eors r1, r2
-; CHECK-T2-NEXT:    bic.w r3, r1, r3
+; CHECK-T2-NEXT:    bics r1, r3
+; CHECK-T2-NEXT:    cmp r1, #0
 ; CHECK-T2-NEXT:    mov.w r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
 ; CHECK-T2-NEXT:    it mi
 ; CHECK-T2-NEXT:    asrmi r0, r2, #31
-; CHECK-T2-NEXT:    cmp r2, #0
 ; CHECK-T2-NEXT:    it mi
-; CHECK-T2-NEXT:    mvnmi r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
-; CHECK-T2-NEXT:    it pl
-; CHECK-T2-NEXT:    movpl r1, r2
+; CHECK-T2-NEXT:    eormi.w r2, r1, r2, asr #31
+; CHECK-T2-NEXT:    mov r1, r2
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func64:
@@ -143,14 +108,12 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-ARM-NEXT:    eor r3, r1, r2
 ; CHECK-ARM-NEXT:    adc r2, r1, r2
 ; CHECK-ARM-NEXT:    eor r1, r1, r2
-; CHECK-ARM-NEXT:    bic r3, r1, r3
+; CHECK-ARM-NEXT:    bic r1, r1, r3
+; CHECK-ARM-NEXT:    cmp r1, #0
 ; CHECK-ARM-NEXT:    mov r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
 ; CHECK-ARM-NEXT:    asrmi r0, r2, #31
-; CHECK-ARM-NEXT:    cmp r2, #0
-; CHECK-ARM-NEXT:    mvnmi r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
-; CHECK-ARM-NEXT:    movpl r1, r2
+; CHECK-ARM-NEXT:    eormi r2, r1, r2, asr #31
+; CHECK-ARM-NEXT:    mov r1, r2
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i64 %y, %z
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z)

diff  --git a/llvm/test/CodeGen/ARM/ssub_sat.ll b/llvm/test/CodeGen/ARM/ssub_sat.ll
index 5edbb8e28067..30d7a683654a 100644
--- a/llvm/test/CodeGen/ARM/ssub_sat.ll
+++ b/llvm/test/CodeGen/ARM/ssub_sat.ll
@@ -16,50 +16,22 @@ declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; CHECK-T1-LABEL: func:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    .save {r4, lr}
-; CHECK-T1-NEXT:    push {r4, lr}
-; CHECK-T1-NEXT:    mov r2, r0
-; CHECK-T1-NEXT:    movs r3, #1
 ; CHECK-T1-NEXT:    subs r0, r0, r1
-; CHECK-T1-NEXT:    mov r4, r3
-; CHECK-T1-NEXT:    bmi .LBB0_2
+; CHECK-T1-NEXT:    bvc .LBB0_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    movs r4, #0
+; CHECK-T1-NEXT:    asrs r1, r0, #31
+; CHECK-T1-NEXT:    movs r0, #1
+; CHECK-T1-NEXT:    lsls r0, r0, #31
+; CHECK-T1-NEXT:    eors r0, r1
 ; CHECK-T1-NEXT:  .LBB0_2:
-; CHECK-T1-NEXT:    cmp r4, #0
-; CHECK-T1-NEXT:    bne .LBB0_4
-; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    lsls r3, r3, #31
-; CHECK-T1-NEXT:    cmp r2, r1
-; CHECK-T1-NEXT:    bvs .LBB0_5
-; CHECK-T1-NEXT:    b .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_4:
-; CHECK-T1-NEXT:    ldr r3, .LCPI0_0
-; CHECK-T1-NEXT:    cmp r2, r1
-; CHECK-T1-NEXT:    bvc .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_5:
-; CHECK-T1-NEXT:    mov r0, r3
-; CHECK-T1-NEXT:  .LBB0_6:
-; CHECK-T1-NEXT:    pop {r4, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI0_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
+; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2NODSP-LABEL: func:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    subs.w r12, r0, r1
-; CHECK-T2NODSP-NEXT:    mov.w r3, #0
-; CHECK-T2NODSP-NEXT:    mov.w r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r3, #1
-; CHECK-T2NODSP-NEXT:    cmp r3, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r2, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r2, r12
-; CHECK-T2NODSP-NEXT:    mov r0, r2
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    mov.w r1, #-2147483648
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r1, r0, asr #31
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func:
@@ -69,15 +41,9 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ;
 ; CHECK-ARMNODPS-LABEL: func:
 ; CHECK-ARMNODPS:       @ %bb.0:
-; CHECK-ARMNODPS-NEXT:    subs r12, r0, r1
-; CHECK-ARMNODPS-NEXT:    mov r3, #0
-; CHECK-ARMNODPS-NEXT:    movmi r3, #1
-; CHECK-ARMNODPS-NEXT:    mov r2, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r3, #0
-; CHECK-ARMNODPS-NEXT:    mvnne r2, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r0, r1
-; CHECK-ARMNODPS-NEXT:    movvc r2, r12
-; CHECK-ARMNODPS-NEXT:    mov r0, r2
+; CHECK-ARMNODPS-NEXT:    subs r0, r0, r1
+; CHECK-ARMNODPS-NEXT:    mov r1, #-2147483648
+; CHECK-ARMNODPS-NEXT:    eorvs r0, r1, r0, asr #31
 ; CHECK-ARMNODPS-NEXT:    bx lr
 ;
 ; CHECK-ARMBASEDSP-LABEL: func:
@@ -98,38 +64,30 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-T1:       @ %bb.0:
 ; CHECK-T1-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-T1-NEXT:    push {r4, r5, r7, lr}
-; CHECK-T1-NEXT:    mov r5, r1
-; CHECK-T1-NEXT:    eors r5, r3
-; CHECK-T1-NEXT:    subs r0, r0, r2
 ; CHECK-T1-NEXT:    mov r4, r1
-; CHECK-T1-NEXT:    sbcs r4, r3
-; CHECK-T1-NEXT:    eors r1, r4
-; CHECK-T1-NEXT:    ands r1, r5
-; CHECK-T1-NEXT:    bpl .LBB1_2
+; CHECK-T1-NEXT:    eors r1, r3
+; CHECK-T1-NEXT:    subs r5, r0, r2
+; CHECK-T1-NEXT:    mov r2, r4
+; CHECK-T1-NEXT:    sbcs r2, r3
+; CHECK-T1-NEXT:    eors r4, r2
+; CHECK-T1-NEXT:    ands r4, r1
+; CHECK-T1-NEXT:    asrs r1, r2, #31
+; CHECK-T1-NEXT:    cmp r4, #0
+; CHECK-T1-NEXT:    mov r0, r1
+; CHECK-T1-NEXT:    bmi .LBB1_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    asrs r0, r4, #31
+; CHECK-T1-NEXT:    mov r0, r5
 ; CHECK-T1-NEXT:  .LBB1_2:
 ; CHECK-T1-NEXT:    cmp r4, #0
 ; CHECK-T1-NEXT:    bmi .LBB1_4
 ; CHECK-T1-NEXT:  @ %bb.3:
+; CHECK-T1-NEXT:    mov r1, r2
+; CHECK-T1-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-T1-NEXT:  .LBB1_4:
 ; CHECK-T1-NEXT:    movs r2, #1
 ; CHECK-T1-NEXT:    lsls r2, r2, #31
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bpl .LBB1_5
-; CHECK-T1-NEXT:    b .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_4:
-; CHECK-T1-NEXT:    ldr r2, .LCPI1_0
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bmi .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_5:
-; CHECK-T1-NEXT:    mov r2, r4
-; CHECK-T1-NEXT:  .LBB1_6:
-; CHECK-T1-NEXT:    mov r1, r2
+; CHECK-T1-NEXT:    eors r1, r2
 ; CHECK-T1-NEXT:    pop {r4, r5, r7, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI1_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
 ;
 ; CHECK-T2-LABEL: func2:
 ; CHECK-T2:       @ %bb.0:
@@ -137,16 +95,13 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-T2-NEXT:    eor.w r12, r1, r3
 ; CHECK-T2-NEXT:    sbc.w r2, r1, r3
 ; CHECK-T2-NEXT:    eors r1, r2
-; CHECK-T2-NEXT:    ands.w r3, r12, r1
-; CHECK-T2-NEXT:    mov.w r1, #-2147483648
+; CHECK-T2-NEXT:    ands.w r1, r1, r12
 ; CHECK-T2-NEXT:    it mi
 ; CHECK-T2-NEXT:    asrmi r0, r2, #31
-; CHECK-T2-NEXT:    cmp r2, #0
+; CHECK-T2-NEXT:    mov.w r1, #-2147483648
 ; CHECK-T2-NEXT:    it mi
-; CHECK-T2-NEXT:    mvnmi r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
-; CHECK-T2-NEXT:    it pl
-; CHECK-T2-NEXT:    movpl r1, r2
+; CHECK-T2-NEXT:    eormi.w r2, r1, r2, asr #31
+; CHECK-T2-NEXT:    mov r1, r2
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func2:
@@ -155,13 +110,11 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; CHECK-ARM-NEXT:    eor r12, r1, r3
 ; CHECK-ARM-NEXT:    sbc r2, r1, r3
 ; CHECK-ARM-NEXT:    eor r1, r1, r2
-; CHECK-ARM-NEXT:    ands r3, r12, r1
-; CHECK-ARM-NEXT:    mov r1, #-2147483648
+; CHECK-ARM-NEXT:    ands r1, r12, r1
 ; CHECK-ARM-NEXT:    asrmi r0, r2, #31
-; CHECK-ARM-NEXT:    cmp r2, #0
-; CHECK-ARM-NEXT:    mvnmi r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
-; CHECK-ARM-NEXT:    movpl r1, r2
+; CHECK-ARM-NEXT:    mov r1, #-2147483648
+; CHECK-ARM-NEXT:    eormi r2, r1, r2, asr #31
+; CHECK-ARM-NEXT:    mov r1, r2
 ; CHECK-ARM-NEXT:    bx lr
   %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y)
   ret i64 %tmp
@@ -373,165 +326,64 @@ define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-T1-LABEL: vec:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-T1-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-T1-NEXT:    .pad #12
-; CHECK-T1-NEXT:    sub sp, #12
-; CHECK-T1-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-T1-NEXT:    mov r4, r1
-; CHECK-T1-NEXT:    mov r1, r0
-; CHECK-T1-NEXT:    ldr r5, [sp, #32]
-; CHECK-T1-NEXT:    movs r7, #1
-; CHECK-T1-NEXT:    movs r0, #0
-; CHECK-T1-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-T1-NEXT:    subs r0, r1, r5
-; CHECK-T1-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; CHECK-T1-NEXT:    mov r6, r7
-; CHECK-T1-NEXT:    bmi .LBB5_2
+; CHECK-T1-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-T1-NEXT:    push {r4, r5, r6, lr}
+; CHECK-T1-NEXT:    mov r4, r0
+; CHECK-T1-NEXT:    ldr r6, [sp, #16]
+; CHECK-T1-NEXT:    subs r0, r0, r6
+; CHECK-T1-NEXT:    movs r5, #1
+; CHECK-T1-NEXT:    lsls r5, r5, #31
+; CHECK-T1-NEXT:    cmp r4, r6
+; CHECK-T1-NEXT:    bvc .LBB5_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-T1-NEXT:    asrs r0, r0, #31
+; CHECK-T1-NEXT:    eors r0, r5
 ; CHECK-T1-NEXT:  .LBB5_2:
-; CHECK-T1-NEXT:    lsls r3, r7, #31
-; CHECK-T1-NEXT:    ldr r0, .LCPI5_0
-; CHECK-T1-NEXT:    cmp r6, #0
-; CHECK-T1-NEXT:    mov r6, r0
-; CHECK-T1-NEXT:    bne .LBB5_4
+; CHECK-T1-NEXT:    ldr r4, [sp, #20]
+; CHECK-T1-NEXT:    subs r1, r1, r4
+; CHECK-T1-NEXT:    bvc .LBB5_4
 ; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    mov r6, r3
+; CHECK-T1-NEXT:    asrs r1, r1, #31
+; CHECK-T1-NEXT:    eors r1, r5
 ; CHECK-T1-NEXT:  .LBB5_4:
-; CHECK-T1-NEXT:    cmp r1, r5
+; CHECK-T1-NEXT:    ldr r4, [sp, #24]
+; CHECK-T1-NEXT:    subs r2, r2, r4
 ; CHECK-T1-NEXT:    bvc .LBB5_6
 ; CHECK-T1-NEXT:  @ %bb.5:
-; CHECK-T1-NEXT:    str r6, [sp, #4] @ 4-byte Spill
+; CHECK-T1-NEXT:    asrs r2, r2, #31
+; CHECK-T1-NEXT:    eors r2, r5
 ; CHECK-T1-NEXT:  .LBB5_6:
-; CHECK-T1-NEXT:    ldr r5, [sp, #36]
-; CHECK-T1-NEXT:    subs r1, r4, r5
-; CHECK-T1-NEXT:    mov r6, r7
-; CHECK-T1-NEXT:    bmi .LBB5_8
+; CHECK-T1-NEXT:    ldr r4, [sp, #28]
+; CHECK-T1-NEXT:    subs r3, r3, r4
+; CHECK-T1-NEXT:    bvc .LBB5_8
 ; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-T1-NEXT:    asrs r3, r3, #31
+; CHECK-T1-NEXT:    eors r3, r5
 ; CHECK-T1-NEXT:  .LBB5_8:
-; CHECK-T1-NEXT:    cmp r6, #0
-; CHECK-T1-NEXT:    mov r6, r0
-; CHECK-T1-NEXT:    bne .LBB5_10
-; CHECK-T1-NEXT:  @ %bb.9:
-; CHECK-T1-NEXT:    mov r6, r3
-; CHECK-T1-NEXT:  .LBB5_10:
-; CHECK-T1-NEXT:    cmp r4, r5
-; CHECK-T1-NEXT:    bvc .LBB5_12
-; CHECK-T1-NEXT:  @ %bb.11:
-; CHECK-T1-NEXT:    mov r1, r6
-; CHECK-T1-NEXT:  .LBB5_12:
-; CHECK-T1-NEXT:    ldr r5, [sp, #40]
-; CHECK-T1-NEXT:    subs r4, r2, r5
-; CHECK-T1-NEXT:    mov r6, r7
-; CHECK-T1-NEXT:    bmi .LBB5_14
-; CHECK-T1-NEXT:  @ %bb.13:
-; CHECK-T1-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-T1-NEXT:  .LBB5_14:
-; CHECK-T1-NEXT:    cmp r6, #0
-; CHECK-T1-NEXT:    mov r6, r0
-; CHECK-T1-NEXT:    bne .LBB5_16
-; CHECK-T1-NEXT:  @ %bb.15:
-; CHECK-T1-NEXT:    mov r6, r3
-; CHECK-T1-NEXT:  .LBB5_16:
-; CHECK-T1-NEXT:    cmp r2, r5
-; CHECK-T1-NEXT:    bvc .LBB5_18
-; CHECK-T1-NEXT:  @ %bb.17:
-; CHECK-T1-NEXT:    mov r4, r6
-; CHECK-T1-NEXT:  .LBB5_18:
-; CHECK-T1-NEXT:    ldr r2, [sp, #44]
-; CHECK-T1-NEXT:    ldr r6, [sp] @ 4-byte Reload
-; CHECK-T1-NEXT:    subs r5, r6, r2
-; CHECK-T1-NEXT:    bpl .LBB5_23
-; CHECK-T1-NEXT:  @ %bb.19:
-; CHECK-T1-NEXT:    cmp r7, #0
-; CHECK-T1-NEXT:    beq .LBB5_24
-; CHECK-T1-NEXT:  .LBB5_20:
-; CHECK-T1-NEXT:    cmp r6, r2
-; CHECK-T1-NEXT:    bvc .LBB5_22
-; CHECK-T1-NEXT:  .LBB5_21:
-; CHECK-T1-NEXT:    mov r5, r0
-; CHECK-T1-NEXT:  .LBB5_22:
-; CHECK-T1-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-T1-NEXT:    mov r2, r4
-; CHECK-T1-NEXT:    mov r3, r5
-; CHECK-T1-NEXT:    add sp, #12
-; CHECK-T1-NEXT:    pop {r4, r5, r6, r7, pc}
-; CHECK-T1-NEXT:  .LBB5_23:
-; CHECK-T1-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-T1-NEXT:    cmp r7, #0
-; CHECK-T1-NEXT:    bne .LBB5_20
-; CHECK-T1-NEXT:  .LBB5_24:
-; CHECK-T1-NEXT:    mov r0, r3
-; CHECK-T1-NEXT:    cmp r6, r2
-; CHECK-T1-NEXT:    bvs .LBB5_21
-; CHECK-T1-NEXT:    b .LBB5_22
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.25:
-; CHECK-T1-NEXT:  .LCPI5_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
+; CHECK-T1-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; CHECK-T2NODSP-LABEL: vec:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-T2NODSP-NEXT:    .pad #4
-; CHECK-T2NODSP-NEXT:    sub sp, #4
-; CHECK-T2NODSP-NEXT:    ldr r4, [sp, #24]
-; CHECK-T2NODSP-NEXT:    mov lr, r0
-; CHECK-T2NODSP-NEXT:    ldr r7, [sp, #28]
-; CHECK-T2NODSP-NEXT:    movs r5, #0
-; CHECK-T2NODSP-NEXT:    subs r6, r0, r4
-; CHECK-T2NODSP-NEXT:    mov.w r0, #0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r0, #1
-; CHECK-T2NODSP-NEXT:    cmp r0, #0
-; CHECK-T2NODSP-NEXT:    mov.w r0, #-2147483648
+; CHECK-T2NODSP-NEXT:    .save {r7, lr}
+; CHECK-T2NODSP-NEXT:    push {r7, lr}
+; CHECK-T2NODSP-NEXT:    ldr.w r12, [sp, #8]
+; CHECK-T2NODSP-NEXT:    ldr.w lr, [sp, #12]
+; CHECK-T2NODSP-NEXT:    subs.w r0, r0, r12
 ; CHECK-T2NODSP-NEXT:    mov.w r12, #-2147483648
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r0, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp lr, r4
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r0, r6
-; CHECK-T2NODSP-NEXT:    subs r6, r1, r7
-; CHECK-T2NODSP-NEXT:    mov.w r4, #0
-; CHECK-T2NODSP-NEXT:    mov.w lr, #-2147483648
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r4, #1
-; CHECK-T2NODSP-NEXT:    cmp r4, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne lr, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r1, r7
-; CHECK-T2NODSP-NEXT:    ldr r1, [sp, #32]
-; CHECK-T2NODSP-NEXT:    mov.w r4, #0
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc lr, r6
-; CHECK-T2NODSP-NEXT:    subs r6, r2, r1
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r4, #1
-; CHECK-T2NODSP-NEXT:    cmp r4, #0
-; CHECK-T2NODSP-NEXT:    mov.w r4, #-2147483648
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r4, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r2, r1
-; CHECK-T2NODSP-NEXT:    ldr r1, [sp, #36]
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r4, r6
-; CHECK-T2NODSP-NEXT:    subs r2, r3, r1
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi r5, #1
-; CHECK-T2NODSP-NEXT:    cmp r5, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r12, #-2147483648
-; CHECK-T2NODSP-NEXT:    cmp r3, r1
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r12, r2
-; CHECK-T2NODSP-NEXT:    mov r1, lr
-; CHECK-T2NODSP-NEXT:    mov r2, r4
-; CHECK-T2NODSP-NEXT:    mov r3, r12
-; CHECK-T2NODSP-NEXT:    add sp, #4
-; CHECK-T2NODSP-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r0, r12, r0, asr #31
+; CHECK-T2NODSP-NEXT:    subs.w r1, r1, lr
+; CHECK-T2NODSP-NEXT:    ldr.w lr, [sp, #16]
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r1, r12, r1, asr #31
+; CHECK-T2NODSP-NEXT:    subs.w r2, r2, lr
+; CHECK-T2NODSP-NEXT:    ldr.w lr, [sp, #20]
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r2, r12, r2, asr #31
+; CHECK-T2NODSP-NEXT:    subs.w r3, r3, lr
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r3, r12, r3, asr #31
+; CHECK-T2NODSP-NEXT:    pop {r7, pc}
 ;
 ; CHECK-T2DSP-LABEL: vec:
 ; CHECK-T2DSP:       @ %bb.0:
@@ -547,49 +399,22 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; CHECK-ARMNODPS-LABEL: vec:
 ; CHECK-ARMNODPS:       @ %bb.0:
-; CHECK-ARMNODPS-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; CHECK-ARMNODPS-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARMNODPS-NEXT:    ldr r4, [sp, #24]
-; CHECK-ARMNODPS-NEXT:    mov lr, r0
-; CHECK-ARMNODPS-NEXT:    ldr r7, [sp, #28]
-; CHECK-ARMNODPS-NEXT:    mov r5, #0
-; CHECK-ARMNODPS-NEXT:    subs r6, r0, r4
-; CHECK-ARMNODPS-NEXT:    mov r0, #0
-; CHECK-ARMNODPS-NEXT:    movmi r0, #1
-; CHECK-ARMNODPS-NEXT:    cmp r0, #0
-; CHECK-ARMNODPS-NEXT:    mov r0, #-2147483648
+; CHECK-ARMNODPS-NEXT:    .save {r11, lr}
+; CHECK-ARMNODPS-NEXT:    push {r11, lr}
+; CHECK-ARMNODPS-NEXT:    ldr r12, [sp, #8]
+; CHECK-ARMNODPS-NEXT:    ldr lr, [sp, #12]
+; CHECK-ARMNODPS-NEXT:    subs r0, r0, r12
 ; CHECK-ARMNODPS-NEXT:    mov r12, #-2147483648
-; CHECK-ARMNODPS-NEXT:    mvnne r0, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp lr, r4
-; CHECK-ARMNODPS-NEXT:    movvc r0, r6
-; CHECK-ARMNODPS-NEXT:    subs r6, r1, r7
-; CHECK-ARMNODPS-NEXT:    mov r4, #0
-; CHECK-ARMNODPS-NEXT:    mov lr, #-2147483648
-; CHECK-ARMNODPS-NEXT:    movmi r4, #1
-; CHECK-ARMNODPS-NEXT:    cmp r4, #0
-; CHECK-ARMNODPS-NEXT:    mvnne lr, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r1, r7
-; CHECK-ARMNODPS-NEXT:    ldr r1, [sp, #32]
-; CHECK-ARMNODPS-NEXT:    movvc lr, r6
-; CHECK-ARMNODPS-NEXT:    mov r4, #0
-; CHECK-ARMNODPS-NEXT:    subs r6, r2, r1
-; CHECK-ARMNODPS-NEXT:    movmi r4, #1
-; CHECK-ARMNODPS-NEXT:    cmp r4, #0
-; CHECK-ARMNODPS-NEXT:    mov r4, #-2147483648
-; CHECK-ARMNODPS-NEXT:    mvnne r4, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r2, r1
-; CHECK-ARMNODPS-NEXT:    ldr r1, [sp, #36]
-; CHECK-ARMNODPS-NEXT:    movvc r4, r6
-; CHECK-ARMNODPS-NEXT:    subs r2, r3, r1
-; CHECK-ARMNODPS-NEXT:    movmi r5, #1
-; CHECK-ARMNODPS-NEXT:    cmp r5, #0
-; CHECK-ARMNODPS-NEXT:    mvnne r12, #-2147483648
-; CHECK-ARMNODPS-NEXT:    cmp r3, r1
-; CHECK-ARMNODPS-NEXT:    movvc r12, r2
-; CHECK-ARMNODPS-NEXT:    mov r1, lr
-; CHECK-ARMNODPS-NEXT:    mov r2, r4
-; CHECK-ARMNODPS-NEXT:    mov r3, r12
-; CHECK-ARMNODPS-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; CHECK-ARMNODPS-NEXT:    eorvs r0, r12, r0, asr #31
+; CHECK-ARMNODPS-NEXT:    subs r1, r1, lr
+; CHECK-ARMNODPS-NEXT:    ldr lr, [sp, #16]
+; CHECK-ARMNODPS-NEXT:    eorvs r1, r12, r1, asr #31
+; CHECK-ARMNODPS-NEXT:    subs r2, r2, lr
+; CHECK-ARMNODPS-NEXT:    ldr lr, [sp, #20]
+; CHECK-ARMNODPS-NEXT:    eorvs r2, r12, r2, asr #31
+; CHECK-ARMNODPS-NEXT:    subs r3, r3, lr
+; CHECK-ARMNODPS-NEXT:    eorvs r3, r12, r3, asr #31
+; CHECK-ARMNODPS-NEXT:    pop {r11, pc}
 ;
 ; CHECK-ARMBASEDSP-LABEL: vec:
 ; CHECK-ARMBASEDSP:       @ %bb.0:

diff  --git a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
index 989e0aa4d692..5bf7b326c5b9 100644
--- a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
@@ -13,56 +13,27 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-T1-LABEL: func32:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    .save {r4, lr}
-; CHECK-T1-NEXT:    push {r4, lr}
-; CHECK-T1-NEXT:    mov r3, r0
 ; CHECK-T1-NEXT:    muls r1, r2, r1
-; CHECK-T1-NEXT:    movs r2, #1
 ; CHECK-T1-NEXT:    subs r0, r0, r1
-; CHECK-T1-NEXT:    mov r4, r2
-; CHECK-T1-NEXT:    bmi .LBB0_2
+; CHECK-T1-NEXT:    bvc .LBB0_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    movs r4, #0
+; CHECK-T1-NEXT:    asrs r1, r0, #31
+; CHECK-T1-NEXT:    movs r0, #1
+; CHECK-T1-NEXT:    lsls r0, r0, #31
+; CHECK-T1-NEXT:    eors r0, r1
 ; CHECK-T1-NEXT:  .LBB0_2:
-; CHECK-T1-NEXT:    cmp r4, #0
-; CHECK-T1-NEXT:    bne .LBB0_4
-; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    lsls r2, r2, #31
-; CHECK-T1-NEXT:    cmp r3, r1
-; CHECK-T1-NEXT:    bvs .LBB0_5
-; CHECK-T1-NEXT:    b .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_4:
-; CHECK-T1-NEXT:    ldr r2, .LCPI0_0
-; CHECK-T1-NEXT:    cmp r3, r1
-; CHECK-T1-NEXT:    bvc .LBB0_6
-; CHECK-T1-NEXT:  .LBB0_5:
-; CHECK-T1-NEXT:    mov r0, r2
-; CHECK-T1-NEXT:  .LBB0_6:
-; CHECK-T1-NEXT:    pop {r4, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI0_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
+; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2NODSP-LABEL: func32:
 ; CHECK-T2NODSP:       @ %bb.0:
-; CHECK-T2NODSP-NEXT:    .save {r7, lr}
-; CHECK-T2NODSP-NEXT:    push {r7, lr}
-; CHECK-T2NODSP-NEXT:    mls r12, r1, r2, r0
-; CHECK-T2NODSP-NEXT:    mov.w lr, #0
-; CHECK-T2NODSP-NEXT:    mov.w r3, #-2147483648
+; CHECK-T2NODSP-NEXT:    mls r3, r1, r2, r0
+; CHECK-T2NODSP-NEXT:    mov.w r12, #-2147483648
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
-; CHECK-T2NODSP-NEXT:    cmp.w r12, #0
-; CHECK-T2NODSP-NEXT:    it mi
-; CHECK-T2NODSP-NEXT:    movmi.w lr, #1
-; CHECK-T2NODSP-NEXT:    cmp.w lr, #0
-; CHECK-T2NODSP-NEXT:    it ne
-; CHECK-T2NODSP-NEXT:    mvnne r3, #-2147483648
 ; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it vc
-; CHECK-T2NODSP-NEXT:    movvc r3, r12
+; CHECK-T2NODSP-NEXT:    it vs
+; CHECK-T2NODSP-NEXT:    eorvs.w r3, r12, r3, asr #31
 ; CHECK-T2NODSP-NEXT:    mov r0, r3
-; CHECK-T2NODSP-NEXT:    pop {r7, pc}
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func32:
 ; CHECK-T2DSP:       @ %bb.0:
@@ -83,42 +54,35 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-T1-LABEL: func64:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    .save {r4, lr}
-; CHECK-T1-NEXT:    push {r4, lr}
-; CHECK-T1-NEXT:    ldr r2, [sp, #12]
-; CHECK-T1-NEXT:    mov r4, r1
-; CHECK-T1-NEXT:    eors r4, r2
-; CHECK-T1-NEXT:    ldr r3, [sp, #8]
-; CHECK-T1-NEXT:    subs r0, r0, r3
+; CHECK-T1-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-T1-NEXT:    push {r4, r5, r7, lr}
+; CHECK-T1-NEXT:    ldr r2, [sp, #20]
+; CHECK-T1-NEXT:    mov r5, r1
+; CHECK-T1-NEXT:    eors r5, r2
+; CHECK-T1-NEXT:    ldr r3, [sp, #16]
+; CHECK-T1-NEXT:    subs r4, r0, r3
 ; CHECK-T1-NEXT:    mov r3, r1
 ; CHECK-T1-NEXT:    sbcs r3, r2
 ; CHECK-T1-NEXT:    eors r1, r3
-; CHECK-T1-NEXT:    ands r1, r4
-; CHECK-T1-NEXT:    bpl .LBB1_2
+; CHECK-T1-NEXT:    ands r1, r5
+; CHECK-T1-NEXT:    asrs r2, r3, #31
+; CHECK-T1-NEXT:    cmp r1, #0
+; CHECK-T1-NEXT:    mov r0, r2
+; CHECK-T1-NEXT:    bmi .LBB1_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    asrs r0, r3, #31
+; CHECK-T1-NEXT:    mov r0, r4
 ; CHECK-T1-NEXT:  .LBB1_2:
-; CHECK-T1-NEXT:    cmp r3, #0
+; CHECK-T1-NEXT:    cmp r1, #0
 ; CHECK-T1-NEXT:    bmi .LBB1_4
 ; CHECK-T1-NEXT:  @ %bb.3:
-; CHECK-T1-NEXT:    movs r2, #1
-; CHECK-T1-NEXT:    lsls r2, r2, #31
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bpl .LBB1_5
-; CHECK-T1-NEXT:    b .LBB1_6
+; CHECK-T1-NEXT:    mov r1, r3
+; CHECK-T1-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-T1-NEXT:  .LBB1_4:
-; CHECK-T1-NEXT:    ldr r2, .LCPI1_0
-; CHECK-T1-NEXT:    cmp r1, #0
-; CHECK-T1-NEXT:    bmi .LBB1_6
-; CHECK-T1-NEXT:  .LBB1_5:
-; CHECK-T1-NEXT:    mov r2, r3
-; CHECK-T1-NEXT:  .LBB1_6:
+; CHECK-T1-NEXT:    movs r1, #1
+; CHECK-T1-NEXT:    lsls r1, r1, #31
+; CHECK-T1-NEXT:    eors r2, r1
 ; CHECK-T1-NEXT:    mov r1, r2
-; CHECK-T1-NEXT:    pop {r4, pc}
-; CHECK-T1-NEXT:    .p2align 2
-; CHECK-T1-NEXT:  @ %bb.7:
-; CHECK-T1-NEXT:  .LCPI1_0:
-; CHECK-T1-NEXT:    .long 2147483647 @ 0x7fffffff
+; CHECK-T1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-T2-LABEL: func64:
 ; CHECK-T2:       @ %bb.0:
@@ -128,16 +92,13 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-T2-NEXT:    sbc.w r2, r1, r12
 ; CHECK-T2-NEXT:    eor.w r3, r1, r12
 ; CHECK-T2-NEXT:    eors r1, r2
-; CHECK-T2-NEXT:    ands r3, r1
-; CHECK-T2-NEXT:    mov.w r1, #-2147483648
+; CHECK-T2-NEXT:    ands r1, r3
 ; CHECK-T2-NEXT:    it mi
 ; CHECK-T2-NEXT:    asrmi r0, r2, #31
-; CHECK-T2-NEXT:    cmp r2, #0
+; CHECK-T2-NEXT:    mov.w r1, #-2147483648
 ; CHECK-T2-NEXT:    it mi
-; CHECK-T2-NEXT:    mvnmi r1, #-2147483648
-; CHECK-T2-NEXT:    cmp r3, #0
-; CHECK-T2-NEXT:    it pl
-; CHECK-T2-NEXT:    movpl r1, r2
+; CHECK-T2-NEXT:    eormi.w r2, r1, r2, asr #31
+; CHECK-T2-NEXT:    mov r1, r2
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func64:
@@ -148,13 +109,11 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; CHECK-ARM-NEXT:    eor r3, r1, r2
 ; CHECK-ARM-NEXT:    sbc r2, r1, r2
 ; CHECK-ARM-NEXT:    eor r1, r1, r2
-; CHECK-ARM-NEXT:    ands r3, r3, r1
-; CHECK-ARM-NEXT:    mov r1, #-2147483648
+; CHECK-ARM-NEXT:    ands r1, r3, r1
 ; CHECK-ARM-NEXT:    asrmi r0, r2, #31
-; CHECK-ARM-NEXT:    cmp r2, #0
-; CHECK-ARM-NEXT:    mvnmi r1, #-2147483648
-; CHECK-ARM-NEXT:    cmp r3, #0
-; CHECK-ARM-NEXT:    movpl r1, r2
+; CHECK-ARM-NEXT:    mov r1, #-2147483648
+; CHECK-ARM-NEXT:    eormi r2, r1, r2, asr #31
+; CHECK-ARM-NEXT:    mov r1, r2
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i64 %y, %z
   %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %z)

diff  --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 6f23d316d8ce..7cbd57768200 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -747,118 +747,114 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr {
 ; CHECK-NEXT:    vadduqm 0, 2, 6
 ; CHECK-NEXT:    xxswapd 0, 34
 ; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    addis 3, 2, .LCPI48_0@toc@ha
 ; CHECK-NEXT:    vadduqm 1, 3, 7
 ; CHECK-NEXT:    xxswapd 1, 35
-; CHECK-NEXT:    xxswapd 2, 32
+; CHECK-NEXT:    addi 3, 3, .LCPI48_0@toc@l
+; CHECK-NEXT:    xxswapd 3, 32
 ; CHECK-NEXT:    mfvsrd 4, 34
-; CHECK-NEXT:    mfvsrd 9, 32
-; CHECK-NEXT:    mffprd 0, 0
+; CHECK-NEXT:    mfvsrd 8, 32
+; CHECK-NEXT:    xxswapd 2, 36
+; CHECK-NEXT:    mffprd 12, 0
 ; CHECK-NEXT:    xxswapd 0, 33
-; CHECK-NEXT:    mfvsrd 5, 38
-; CHECK-NEXT:    cmpld 9, 4
-; CHECK-NEXT:    cmpd 1, 9, 4
-; CHECK-NEXT:    vadduqm 6, 4, 8
-; CHECK-NEXT:    mffprd 4, 2
-; CHECK-NEXT:    sradi 5, 5, 63
-; CHECK-NEXT:    mffprd 30, 1
-; CHECK-NEXT:    xxswapd 1, 36
+; CHECK-NEXT:    vadduqm 10, 4, 8
+; CHECK-NEXT:    cmpld 8, 4
+; CHECK-NEXT:    cmpd 1, 8, 4
+; CHECK-NEXT:    mffprd 4, 3
+; CHECK-NEXT:    lxvd2x 3, 0, 3
+; CHECK-NEXT:    sradi 3, 8, 63
+; CHECK-NEXT:    mffprd 0, 1
+; CHECK-NEXT:    xxswapd 1, 37
+; CHECK-NEXT:    mfvsrd 5, 35
+; CHECK-NEXT:    vadduqm 11, 5, 9
+; CHECK-NEXT:    xxswapd 34, 3
+; CHECK-NEXT:    mfvsrd 9, 33
 ; CHECK-NEXT:    crandc 20, 4, 2
-; CHECK-NEXT:    cmpld 1, 4, 0
+; CHECK-NEXT:    cmpld 1, 4, 12
 ; CHECK-NEXT:    mffprd 4, 0
-; CHECK-NEXT:    xxswapd 0, 38
-; CHECK-NEXT:    mfvsrd 6, 35
-; CHECK-NEXT:    vadduqm 10, 5, 9
-; CHECK-NEXT:    cmpld 6, 4, 30
-; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; CHECK-NEXT:    mfvsrd 10, 33
-; CHECK-NEXT:    mfvsrd 7, 36
-; CHECK-NEXT:    mfvsrd 11, 38
+; CHECK-NEXT:    xxswapd 0, 42
+; CHECK-NEXT:    mfvsrd 6, 36
+; CHECK-NEXT:    mfvsrd 10, 42
+; CHECK-NEXT:    cmpld 6, 4, 0
 ; CHECK-NEXT:    crand 21, 2, 4
-; CHECK-NEXT:    cmpld 10, 6
-; CHECK-NEXT:    cmpd 1, 10, 6
-; CHECK-NEXT:    mffprd 6, 1
-; CHECK-NEXT:    xxswapd 1, 37
+; CHECK-NEXT:    cmpld 9, 5
+; CHECK-NEXT:    cmpd 1, 9, 5
+; CHECK-NEXT:    mffprd 5, 1
+; CHECK-NEXT:    xxswapd 1, 43
+; CHECK-NEXT:    mffprd 30, 2
 ; CHECK-NEXT:    mffprd 4, 0
-; CHECK-NEXT:    xxswapd 0, 42
-; CHECK-NEXT:    mfvsrd 8, 37
-; CHECK-NEXT:    mfvsrd 12, 42
+; CHECK-NEXT:    mfvsrd 7, 37
+; CHECK-NEXT:    mfvsrd 11, 43
 ; CHECK-NEXT:    crandc 22, 4, 2
-; CHECK-NEXT:    cmpd 1, 11, 7
+; CHECK-NEXT:    cmpd 1, 10, 6
 ; CHECK-NEXT:    crand 23, 2, 24
-; CHECK-NEXT:    cmpld 11, 7
+; CHECK-NEXT:    cmpld 10, 6
 ; CHECK-NEXT:    crandc 24, 4, 2
-; CHECK-NEXT:    cmpld 1, 4, 6
+; CHECK-NEXT:    cmpld 1, 4, 30
+; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mffprd 4, 1
-; CHECK-NEXT:    mffprd 6, 0
+; CHECK-NEXT:    mfvsrd 6, 38
 ; CHECK-NEXT:    crand 25, 2, 4
-; CHECK-NEXT:    cmpld 12, 8
-; CHECK-NEXT:    cmpd 1, 12, 8
+; CHECK-NEXT:    cmpld 11, 7
+; CHECK-NEXT:    cmpd 1, 11, 7
 ; CHECK-NEXT:    crandc 26, 4, 2
-; CHECK-NEXT:    cmpld 1, 6, 4
+; CHECK-NEXT:    cmpld 1, 4, 5
+; CHECK-NEXT:    sradi 4, 6, 63
+; CHECK-NEXT:    mtfprd 0, 4
 ; CHECK-NEXT:    mfvsrd 4, 39
-; CHECK-NEXT:    mtfprd 0, 5
+; CHECK-NEXT:    mfvsrd 5, 40
+; CHECK-NEXT:    mfvsrd 6, 41
 ; CHECK-NEXT:    sradi 4, 4, 63
-; CHECK-NEXT:    mfvsrd 5, 41
 ; CHECK-NEXT:    mtfprd 1, 4
-; CHECK-NEXT:    xxspltd 34, 0, 0
-; CHECK-NEXT:    mfvsrd 4, 40
+; CHECK-NEXT:    sradi 4, 5, 63
+; CHECK-NEXT:    mtfprd 2, 4
+; CHECK-NEXT:    sradi 4, 6, 63
+; CHECK-NEXT:    mtfprd 5, 3
+; CHECK-NEXT:    sradi 3, 10, 63
+; CHECK-NEXT:    mtfprd 4, 4
+; CHECK-NEXT:    sradi 4, 9, 63
+; CHECK-NEXT:    mtfprd 6, 4
+; CHECK-NEXT:    xxspltd 35, 5, 0
+; CHECK-NEXT:    sradi 4, 11, 63
 ; CHECK-NEXT:    crnor 20, 21, 20
-; CHECK-NEXT:    sradi 4, 4, 63
+; CHECK-NEXT:    xxspltd 38, 4, 0
+; CHECK-NEXT:    mtfprd 3, 3
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    xxspltd 36, 6, 0
+; CHECK-NEXT:    mtfprd 5, 4
 ; CHECK-NEXT:    crand 27, 2, 4
-; CHECK-NEXT:    mtfprd 2, 4
-; CHECK-NEXT:    sradi 4, 5, 63
-; CHECK-NEXT:    sradi 5, 10, 63
-; CHECK-NEXT:    mtfprd 3, 4
+; CHECK-NEXT:    xxspltd 37, 3, 0
+; CHECK-NEXT:    xxlxor 3, 35, 34
+; CHECK-NEXT:    xxspltd 35, 5, 0
 ; CHECK-NEXT:    isel 4, 0, 3, 20
-; CHECK-NEXT:    xxspltd 36, 2, 0
+; CHECK-NEXT:    mtfprd 8, 4
 ; CHECK-NEXT:    crnor 20, 23, 22
-; CHECK-NEXT:    mtfprd 4, 4
-; CHECK-NEXT:    sradi 4, 9, 63
-; CHECK-NEXT:    mtfprd 0, 4
-; CHECK-NEXT:    addis 4, 2, .LCPI48_0@toc@ha
-; CHECK-NEXT:    mtfprd 5, 5
-; CHECK-NEXT:    xxspltd 35, 4, 0
-; CHECK-NEXT:    addi 4, 4, .LCPI48_0@toc@l
-; CHECK-NEXT:    isel 5, 0, 3, 20
-; CHECK-NEXT:    lxvd2x 6, 0, 4
-; CHECK-NEXT:    mtfprd 4, 5
-; CHECK-NEXT:    addis 5, 2, .LCPI48_1@toc@ha
-; CHECK-NEXT:    xxspltd 37, 5, 0
-; CHECK-NEXT:    addi 4, 5, .LCPI48_1@toc@l
-; CHECK-NEXT:    xxlxor 7, 34, 35
+; CHECK-NEXT:    crnor 21, 25, 24
+; CHECK-NEXT:    crnor 22, 27, 26
+; CHECK-NEXT:    xxlxor 5, 36, 34
+; CHECK-NEXT:    xxspltd 36, 2, 0
+; CHECK-NEXT:    xxlxor 6, 37, 34
+; CHECK-NEXT:    xxlxor 7, 35, 34
+; CHECK-NEXT:    xxspltd 34, 0, 0
+; CHECK-NEXT:    xxspltd 35, 8, 0
+; CHECK-NEXT:    isel 4, 0, 3, 20
+; CHECK-NEXT:    isel 5, 0, 3, 21
+; CHECK-NEXT:    isel 3, 0, 3, 22
+; CHECK-NEXT:    xxlxor 0, 34, 35
 ; CHECK-NEXT:    xxspltd 34, 1, 0
-; CHECK-NEXT:    sradi 5, 11, 63
-; CHECK-NEXT:    lxvd2x 8, 0, 4
-; CHECK-NEXT:    xxspltd 35, 4, 0
-; CHECK-NEXT:    crnor 20, 25, 24
-; CHECK-NEXT:    sradi 4, 12, 63
-; CHECK-NEXT:    crnor 21, 27, 26
-; CHECK-NEXT:    xxswapd 4, 6
+; CHECK-NEXT:    mtfprd 8, 4
 ; CHECK-NEXT:    mtfprd 1, 5
-; CHECK-NEXT:    mtfprd 9, 4
-; CHECK-NEXT:    xxswapd 6, 8
-; CHECK-NEXT:    xxlxor 2, 34, 35
-; CHECK-NEXT:    xxspltd 35, 0, 0
-; CHECK-NEXT:    isel 4, 0, 3, 20
-; CHECK-NEXT:    xxspltd 39, 1, 0
-; CHECK-NEXT:    isel 3, 0, 3, 21
-; CHECK-NEXT:    xxspltd 40, 9, 0
-; CHECK-NEXT:    mtfprd 0, 4
-; CHECK-NEXT:    xxspltd 34, 3, 0
-; CHECK-NEXT:    mtfprd 1, 3
-; CHECK-NEXT:    xxsel 3, 6, 4, 39
-; CHECK-NEXT:    xxspltd 41, 0, 0
-; CHECK-NEXT:    xxsel 0, 6, 4, 35
-; CHECK-NEXT:    xxspltd 35, 1, 0
-; CHECK-NEXT:    xxsel 1, 6, 4, 37
-; CHECK-NEXT:    xxsel 4, 6, 4, 40
-; CHECK-NEXT:    xxlxor 5, 36, 41
-; CHECK-NEXT:    xxlxor 6, 34, 35
-; CHECK-NEXT:    xxsel 34, 32, 0, 7
-; CHECK-NEXT:    xxsel 35, 33, 1, 2
-; CHECK-NEXT:    xxsel 36, 38, 3, 5
-; CHECK-NEXT:    xxsel 37, 42, 4, 6
+; CHECK-NEXT:    mtfprd 9, 3
+; CHECK-NEXT:    xxspltd 35, 8, 0
+; CHECK-NEXT:    xxspltd 37, 1, 0
+; CHECK-NEXT:    xxspltd 39, 9, 0
+; CHECK-NEXT:    xxlxor 1, 34, 35
+; CHECK-NEXT:    xxsel 34, 32, 3, 0
+; CHECK-NEXT:    xxlxor 2, 36, 37
+; CHECK-NEXT:    xxlxor 4, 38, 39
+; CHECK-NEXT:    xxsel 35, 33, 5, 1
+; CHECK-NEXT:    xxsel 36, 42, 6, 2
+; CHECK-NEXT:    xxsel 37, 43, 7, 4
 ; CHECK-NEXT:    blr
   %c = call <4 x i128> @llvm.sadd.sat.v4i128(<4 x i128> %a, <4 x i128> %b)
   ret <4 x i128> %c

diff  --git a/llvm/test/CodeGen/RISCV/sadd_sat.ll b/llvm/test/CodeGen/RISCV/sadd_sat.ll
index be326d215577..4ced792c37b3 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat.ll
@@ -21,9 +21,9 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-NEXT:    slti a1, a1, 0
 ; RV32I-NEXT:    beq a1, a2, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:  .LBB0_2:
 ; RV32I-NEXT:    ret
 ;
@@ -52,9 +52,9 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32IZbbNOZbt-NEXT:    slti a1, a1, 0
 ; RV32IZbbNOZbt-NEXT:    beq a1, a2, .LBB0_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV32IZbbNOZbt-NEXT:    srai a0, a0, 31
 ; RV32IZbbNOZbt-NEXT:    lui a1, 524288
-; RV32IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV32IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB0_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -70,14 +70,13 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32IZbbZbt-LABEL: func:
 ; RV32IZbbZbt:       # %bb.0:
 ; RV32IZbbZbt-NEXT:    add a2, a0, a1
-; RV32IZbbZbt-NEXT:    slti a3, a2, 0
-; RV32IZbbZbt-NEXT:    lui a4, 524288
-; RV32IZbbZbt-NEXT:    addi a5, a4, -1
-; RV32IZbbZbt-NEXT:    cmov a3, a3, a5, a4
 ; RV32IZbbZbt-NEXT:    slt a0, a2, a0
 ; RV32IZbbZbt-NEXT:    slti a1, a1, 0
 ; RV32IZbbZbt-NEXT:    xor a0, a1, a0
-; RV32IZbbZbt-NEXT:    cmov a0, a0, a3, a2
+; RV32IZbbZbt-NEXT:    srai a1, a2, 31
+; RV32IZbbZbt-NEXT:    lui a3, 524288
+; RV32IZbbZbt-NEXT:    xor a1, a1, a3
+; RV32IZbbZbt-NEXT:    cmov a0, a0, a1, a2
 ; RV32IZbbZbt-NEXT:    ret
   %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y);
   ret i32 %tmp;
@@ -98,11 +97,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    bgez a2, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    sub a2, a2, a0
 ; RV32I-NEXT:    srai a0, a1, 31
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    xor a1, a0, a1
 ; RV32I-NEXT:  .LBB1_2:
 ; RV32I-NEXT:    ret
 ;
@@ -114,10 +111,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64I-NEXT:    slti a1, a1, 0
 ; RV64I-NEXT:    beq a1, a2, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    addi a1, zero, -1
 ; RV64I-NEXT:    slli a1, a1, 63
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:  .LBB1_2:
 ; RV64I-NEXT:    ret
 ;
@@ -134,11 +131,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbbNOZbt-NEXT:    andn a2, a2, a3
 ; RV32IZbbNOZbt-NEXT:    bgez a2, .LBB1_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a1, 0
-; RV32IZbbNOZbt-NEXT:    lui a2, 524288
-; RV32IZbbNOZbt-NEXT:    sub a2, a2, a0
 ; RV32IZbbNOZbt-NEXT:    srai a0, a1, 31
-; RV32IZbbNOZbt-NEXT:    mv a1, a2
+; RV32IZbbNOZbt-NEXT:    lui a1, 524288
+; RV32IZbbNOZbt-NEXT:    xor a1, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -150,10 +145,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64IZbbNOZbt-NEXT:    slti a1, a1, 0
 ; RV64IZbbNOZbt-NEXT:    beq a1, a2, .LBB1_2
 ; RV64IZbbNOZbt-NEXT:  # %bb.1:
-; RV64IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV64IZbbNOZbt-NEXT:    srai a0, a0, 63
 ; RV64IZbbNOZbt-NEXT:    addi a1, zero, -1
 ; RV64IZbbNOZbt-NEXT:    slli a1, a1, 63
-; RV64IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV64IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV64IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV64IZbbNOZbt-NEXT:    ret
 ;
@@ -163,31 +158,28 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbbZbt-NEXT:    add a2, a0, a2
 ; RV32IZbbZbt-NEXT:    sltu a0, a2, a0
 ; RV32IZbbZbt-NEXT:    add a0, a4, a0
-; RV32IZbbZbt-NEXT:    slti a4, a0, 0
-; RV32IZbbZbt-NEXT:    lui a6, 524288
-; RV32IZbbZbt-NEXT:    addi a5, a6, -1
-; RV32IZbbZbt-NEXT:    cmov a4, a4, a5, a6
+; RV32IZbbZbt-NEXT:    srai a4, a0, 31
+; RV32IZbbZbt-NEXT:    lui a5, 524288
+; RV32IZbbZbt-NEXT:    xor a6, a4, a5
 ; RV32IZbbZbt-NEXT:    xor a5, a1, a0
 ; RV32IZbbZbt-NEXT:    xor a1, a1, a3
 ; RV32IZbbZbt-NEXT:    andn a1, a5, a1
 ; RV32IZbbZbt-NEXT:    slti a3, a1, 0
-; RV32IZbbZbt-NEXT:    cmov a1, a3, a4, a0
-; RV32IZbbZbt-NEXT:    srai a0, a0, 31
-; RV32IZbbZbt-NEXT:    cmov a0, a3, a0, a2
+; RV32IZbbZbt-NEXT:    cmov a1, a3, a6, a0
+; RV32IZbbZbt-NEXT:    cmov a0, a3, a4, a2
 ; RV32IZbbZbt-NEXT:    ret
 ;
 ; RV64IZbbZbt-LABEL: func2:
 ; RV64IZbbZbt:       # %bb.0:
 ; RV64IZbbZbt-NEXT:    add a2, a0, a1
-; RV64IZbbZbt-NEXT:    slti a3, a2, 0
-; RV64IZbbZbt-NEXT:    addi a4, zero, -1
-; RV64IZbbZbt-NEXT:    slli a5, a4, 63
-; RV64IZbbZbt-NEXT:    srli a4, a4, 1
-; RV64IZbbZbt-NEXT:    cmov a3, a3, a4, a5
 ; RV64IZbbZbt-NEXT:    slt a0, a2, a0
 ; RV64IZbbZbt-NEXT:    slti a1, a1, 0
 ; RV64IZbbZbt-NEXT:    xor a0, a1, a0
-; RV64IZbbZbt-NEXT:    cmov a0, a0, a3, a2
+; RV64IZbbZbt-NEXT:    srai a1, a2, 63
+; RV64IZbbZbt-NEXT:    addi a3, zero, -1
+; RV64IZbbZbt-NEXT:    slli a3, a3, 63
+; RV64IZbbZbt-NEXT:    xor a1, a1, a3
+; RV64IZbbZbt-NEXT:    cmov a0, a0, a1, a2
 ; RV64IZbbZbt-NEXT:    ret
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y);
   ret i64 %tmp;

diff  --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
index eb79a95e63a2..eb0988ecbc93 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
@@ -22,9 +22,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32I-NEXT:    slti a1, a1, 0
 ; RV32I-NEXT:    beq a1, a2, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:  .LBB0_2:
 ; RV32I-NEXT:    ret
 ;
@@ -56,9 +56,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32IZbbNOZbt-NEXT:    slti a1, a1, 0
 ; RV32IZbbNOZbt-NEXT:    beq a1, a2, .LBB0_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV32IZbbNOZbt-NEXT:    srai a0, a0, 31
 ; RV32IZbbNOZbt-NEXT:    lui a1, 524288
-; RV32IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV32IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB0_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -80,10 +80,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32IZbbZbt-NEXT:    slt a0, a2, a0
 ; RV32IZbbZbt-NEXT:    slti a1, a1, 0
 ; RV32IZbbZbt-NEXT:    xor a0, a1, a0
-; RV32IZbbZbt-NEXT:    slti a1, a2, 0
+; RV32IZbbZbt-NEXT:    srai a1, a2, 31
 ; RV32IZbbZbt-NEXT:    lui a3, 524288
-; RV32IZbbZbt-NEXT:    addi a4, a3, -1
-; RV32IZbbZbt-NEXT:    cmov a1, a1, a4, a3
+; RV32IZbbZbt-NEXT:    xor a1, a1, a3
 ; RV32IZbbZbt-NEXT:    cmov a0, a0, a1, a2
 ; RV32IZbbZbt-NEXT:    ret
   %a = mul i32 %y, %z
@@ -106,11 +105,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    bgez a2, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    sub a2, a2, a0
 ; RV32I-NEXT:    srai a0, a1, 31
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    xor a1, a0, a1
 ; RV32I-NEXT:  .LBB1_2:
 ; RV32I-NEXT:    ret
 ;
@@ -122,10 +119,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64I-NEXT:    slti a2, a2, 0
 ; RV64I-NEXT:    beq a2, a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    addi a1, zero, -1
 ; RV64I-NEXT:    slli a1, a1, 63
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:  .LBB1_2:
 ; RV64I-NEXT:    ret
 ;
@@ -142,11 +139,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbbNOZbt-NEXT:    andn a2, a3, a2
 ; RV32IZbbNOZbt-NEXT:    bgez a2, .LBB1_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a1, 0
-; RV32IZbbNOZbt-NEXT:    lui a2, 524288
-; RV32IZbbNOZbt-NEXT:    sub a2, a2, a0
 ; RV32IZbbNOZbt-NEXT:    srai a0, a1, 31
-; RV32IZbbNOZbt-NEXT:    mv a1, a2
+; RV32IZbbNOZbt-NEXT:    lui a1, 524288
+; RV32IZbbNOZbt-NEXT:    xor a1, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -158,10 +153,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64IZbbNOZbt-NEXT:    slti a2, a2, 0
 ; RV64IZbbNOZbt-NEXT:    beq a2, a1, .LBB1_2
 ; RV64IZbbNOZbt-NEXT:  # %bb.1:
-; RV64IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV64IZbbNOZbt-NEXT:    srai a0, a0, 63
 ; RV64IZbbNOZbt-NEXT:    addi a1, zero, -1
 ; RV64IZbbNOZbt-NEXT:    slli a1, a1, 63
-; RV64IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV64IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV64IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV64IZbbNOZbt-NEXT:    ret
 ;
@@ -171,31 +166,28 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbbZbt-NEXT:    add a3, a0, a4
 ; RV32IZbbZbt-NEXT:    sltu a0, a3, a0
 ; RV32IZbbZbt-NEXT:    add a0, a2, a0
-; RV32IZbbZbt-NEXT:    slti a2, a0, 0
-; RV32IZbbZbt-NEXT:    lui a6, 524288
-; RV32IZbbZbt-NEXT:    addi a4, a6, -1
-; RV32IZbbZbt-NEXT:    cmov a2, a2, a4, a6
+; RV32IZbbZbt-NEXT:    srai a2, a0, 31
+; RV32IZbbZbt-NEXT:    lui a4, 524288
+; RV32IZbbZbt-NEXT:    xor a6, a2, a4
 ; RV32IZbbZbt-NEXT:    xor a4, a1, a0
 ; RV32IZbbZbt-NEXT:    xor a1, a1, a5
 ; RV32IZbbZbt-NEXT:    andn a1, a4, a1
 ; RV32IZbbZbt-NEXT:    slti a4, a1, 0
-; RV32IZbbZbt-NEXT:    cmov a1, a4, a2, a0
-; RV32IZbbZbt-NEXT:    srai a0, a0, 31
-; RV32IZbbZbt-NEXT:    cmov a0, a4, a0, a3
+; RV32IZbbZbt-NEXT:    cmov a1, a4, a6, a0
+; RV32IZbbZbt-NEXT:    cmov a0, a4, a2, a3
 ; RV32IZbbZbt-NEXT:    ret
 ;
 ; RV64IZbbZbt-LABEL: func64:
 ; RV64IZbbZbt:       # %bb.0:
 ; RV64IZbbZbt-NEXT:    add a1, a0, a2
-; RV64IZbbZbt-NEXT:    slti a3, a1, 0
-; RV64IZbbZbt-NEXT:    addi a4, zero, -1
-; RV64IZbbZbt-NEXT:    slli a5, a4, 63
-; RV64IZbbZbt-NEXT:    srli a4, a4, 1
-; RV64IZbbZbt-NEXT:    cmov a3, a3, a4, a5
 ; RV64IZbbZbt-NEXT:    slt a0, a1, a0
 ; RV64IZbbZbt-NEXT:    slti a2, a2, 0
 ; RV64IZbbZbt-NEXT:    xor a0, a2, a0
-; RV64IZbbZbt-NEXT:    cmov a0, a0, a3, a1
+; RV64IZbbZbt-NEXT:    srai a2, a1, 63
+; RV64IZbbZbt-NEXT:    addi a3, zero, -1
+; RV64IZbbZbt-NEXT:    slli a3, a3, 63
+; RV64IZbbZbt-NEXT:    xor a2, a2, a3
+; RV64IZbbZbt-NEXT:    cmov a0, a0, a2, a1
 ; RV64IZbbZbt-NEXT:    ret
   %a = mul i64 %y, %z
   %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %z)

diff  --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll
index d845f87fd592..94acffac7a81 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll
@@ -21,9 +21,9 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-NEXT:    slt a1, a0, a2
 ; RV32I-NEXT:    beq a3, a1, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:  .LBB0_2:
 ; RV32I-NEXT:    ret
 ;
@@ -52,9 +52,9 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32IZbbNOZbt-NEXT:    slt a1, a0, a2
 ; RV32IZbbNOZbt-NEXT:    beq a3, a1, .LBB0_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV32IZbbNOZbt-NEXT:    srai a0, a0, 31
 ; RV32IZbbNOZbt-NEXT:    lui a1, 524288
-; RV32IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV32IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB0_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -73,10 +73,9 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32IZbbZbt-NEXT:    sub a1, a0, a1
 ; RV32IZbbZbt-NEXT:    slt a0, a1, a0
 ; RV32IZbbZbt-NEXT:    xor a0, a2, a0
-; RV32IZbbZbt-NEXT:    slti a2, a1, 0
+; RV32IZbbZbt-NEXT:    srai a2, a1, 31
 ; RV32IZbbZbt-NEXT:    lui a3, 524288
-; RV32IZbbZbt-NEXT:    addi a4, a3, -1
-; RV32IZbbZbt-NEXT:    cmov a2, a2, a4, a3
+; RV32IZbbZbt-NEXT:    xor a2, a2, a3
 ; RV32IZbbZbt-NEXT:    cmov a0, a0, a2, a1
 ; RV32IZbbZbt-NEXT:    ret
   %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y);
@@ -98,11 +97,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB1_2:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    sub a2, a2, a0
 ; RV32I-NEXT:    srai a0, a1, 31
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    xor a1, a0, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func2:
@@ -113,10 +110,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64I-NEXT:    slt a1, a0, a2
 ; RV64I-NEXT:    beq a3, a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    addi a1, zero, -1
 ; RV64I-NEXT:    slli a1, a1, 63
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:  .LBB1_2:
 ; RV64I-NEXT:    ret
 ;
@@ -134,11 +131,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbbNOZbt-NEXT:    sub a0, a0, a2
 ; RV32IZbbNOZbt-NEXT:    ret
 ; RV32IZbbNOZbt-NEXT:  .LBB1_2:
-; RV32IZbbNOZbt-NEXT:    slti a0, a1, 0
-; RV32IZbbNOZbt-NEXT:    lui a2, 524288
-; RV32IZbbNOZbt-NEXT:    sub a2, a2, a0
 ; RV32IZbbNOZbt-NEXT:    srai a0, a1, 31
-; RV32IZbbNOZbt-NEXT:    mv a1, a2
+; RV32IZbbNOZbt-NEXT:    lui a1, 524288
+; RV32IZbbNOZbt-NEXT:    xor a1, a0, a1
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
 ; RV64IZbbNOZbt-LABEL: func2:
@@ -149,10 +144,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64IZbbNOZbt-NEXT:    slt a1, a0, a2
 ; RV64IZbbNOZbt-NEXT:    beq a3, a1, .LBB1_2
 ; RV64IZbbNOZbt-NEXT:  # %bb.1:
-; RV64IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV64IZbbNOZbt-NEXT:    srai a0, a0, 63
 ; RV64IZbbNOZbt-NEXT:    addi a1, zero, -1
 ; RV64IZbbNOZbt-NEXT:    slli a1, a1, 63
-; RV64IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV64IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV64IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV64IZbbNOZbt-NEXT:    ret
 ;
@@ -161,18 +156,16 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbbZbt-NEXT:    sltu a4, a0, a2
 ; RV32IZbbZbt-NEXT:    sub a5, a1, a3
 ; RV32IZbbZbt-NEXT:    sub a4, a5, a4
-; RV32IZbbZbt-NEXT:    slti a7, a4, 0
-; RV32IZbbZbt-NEXT:    lui a6, 524288
-; RV32IZbbZbt-NEXT:    addi a5, a6, -1
-; RV32IZbbZbt-NEXT:    cmov a6, a7, a5, a6
+; RV32IZbbZbt-NEXT:    srai a6, a4, 31
+; RV32IZbbZbt-NEXT:    lui a5, 524288
+; RV32IZbbZbt-NEXT:    xor a7, a6, a5
 ; RV32IZbbZbt-NEXT:    xor a5, a1, a4
 ; RV32IZbbZbt-NEXT:    xor a1, a1, a3
 ; RV32IZbbZbt-NEXT:    and a1, a1, a5
 ; RV32IZbbZbt-NEXT:    slti a3, a1, 0
-; RV32IZbbZbt-NEXT:    cmov a1, a3, a6, a4
-; RV32IZbbZbt-NEXT:    srai a4, a4, 31
+; RV32IZbbZbt-NEXT:    cmov a1, a3, a7, a4
 ; RV32IZbbZbt-NEXT:    sub a0, a0, a2
-; RV32IZbbZbt-NEXT:    cmov a0, a3, a4, a0
+; RV32IZbbZbt-NEXT:    cmov a0, a3, a6, a0
 ; RV32IZbbZbt-NEXT:    ret
 ;
 ; RV64IZbbZbt-LABEL: func2:
@@ -181,11 +174,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64IZbbZbt-NEXT:    sub a1, a0, a1
 ; RV64IZbbZbt-NEXT:    slt a0, a1, a0
 ; RV64IZbbZbt-NEXT:    xor a0, a2, a0
-; RV64IZbbZbt-NEXT:    slti a2, a1, 0
+; RV64IZbbZbt-NEXT:    srai a2, a1, 63
 ; RV64IZbbZbt-NEXT:    addi a3, zero, -1
-; RV64IZbbZbt-NEXT:    slli a4, a3, 63
-; RV64IZbbZbt-NEXT:    srli a3, a3, 1
-; RV64IZbbZbt-NEXT:    cmov a2, a2, a3, a4
+; RV64IZbbZbt-NEXT:    slli a3, a3, 63
+; RV64IZbbZbt-NEXT:    xor a2, a2, a3
 ; RV64IZbbZbt-NEXT:    cmov a0, a0, a2, a1
 ; RV64IZbbZbt-NEXT:    ret
   %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y);

diff  --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
index 605d7eeef2a3..fa93cc0e7e88 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
@@ -22,9 +22,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32I-NEXT:    slt a2, a0, a3
 ; RV32I-NEXT:    beq a1, a2, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:  .LBB0_2:
 ; RV32I-NEXT:    ret
 ;
@@ -56,9 +56,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32IZbbNOZbt-NEXT:    slt a2, a0, a3
 ; RV32IZbbNOZbt-NEXT:    beq a1, a2, .LBB0_2
 ; RV32IZbbNOZbt-NEXT:  # %bb.1:
-; RV32IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV32IZbbNOZbt-NEXT:    srai a0, a0, 31
 ; RV32IZbbNOZbt-NEXT:    lui a1, 524288
-; RV32IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV32IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV32IZbbNOZbt-NEXT:  .LBB0_2:
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
@@ -80,10 +80,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32IZbbZbt-NEXT:    sub a1, a0, a1
 ; RV32IZbbZbt-NEXT:    slt a0, a1, a0
 ; RV32IZbbZbt-NEXT:    xor a0, a2, a0
-; RV32IZbbZbt-NEXT:    slti a2, a1, 0
+; RV32IZbbZbt-NEXT:    srai a2, a1, 31
 ; RV32IZbbZbt-NEXT:    lui a3, 524288
-; RV32IZbbZbt-NEXT:    addi a4, a3, -1
-; RV32IZbbZbt-NEXT:    cmov a2, a2, a4, a3
+; RV32IZbbZbt-NEXT:    xor a2, a2, a3
 ; RV32IZbbZbt-NEXT:    cmov a0, a0, a2, a1
 ; RV32IZbbZbt-NEXT:    ret
   %a = mul i32 %y, %z
@@ -106,11 +105,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I-NEXT:    sub a0, a0, a4
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB1_2:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    sub a2, a2, a0
 ; RV32I-NEXT:    srai a0, a1, 31
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    xor a1, a0, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func64:
@@ -121,10 +118,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64I-NEXT:    slt a1, a0, a1
 ; RV64I-NEXT:    beq a3, a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    addi a1, zero, -1
 ; RV64I-NEXT:    slli a1, a1, 63
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:  .LBB1_2:
 ; RV64I-NEXT:    ret
 ;
@@ -142,11 +139,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbbNOZbt-NEXT:    sub a0, a0, a4
 ; RV32IZbbNOZbt-NEXT:    ret
 ; RV32IZbbNOZbt-NEXT:  .LBB1_2:
-; RV32IZbbNOZbt-NEXT:    slti a0, a1, 0
-; RV32IZbbNOZbt-NEXT:    lui a2, 524288
-; RV32IZbbNOZbt-NEXT:    sub a2, a2, a0
 ; RV32IZbbNOZbt-NEXT:    srai a0, a1, 31
-; RV32IZbbNOZbt-NEXT:    mv a1, a2
+; RV32IZbbNOZbt-NEXT:    lui a1, 524288
+; RV32IZbbNOZbt-NEXT:    xor a1, a0, a1
 ; RV32IZbbNOZbt-NEXT:    ret
 ;
 ; RV64IZbbNOZbt-LABEL: func64:
@@ -157,10 +152,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64IZbbNOZbt-NEXT:    slt a1, a0, a1
 ; RV64IZbbNOZbt-NEXT:    beq a3, a1, .LBB1_2
 ; RV64IZbbNOZbt-NEXT:  # %bb.1:
-; RV64IZbbNOZbt-NEXT:    slti a0, a0, 0
+; RV64IZbbNOZbt-NEXT:    srai a0, a0, 63
 ; RV64IZbbNOZbt-NEXT:    addi a1, zero, -1
 ; RV64IZbbNOZbt-NEXT:    slli a1, a1, 63
-; RV64IZbbNOZbt-NEXT:    sub a0, a1, a0
+; RV64IZbbNOZbt-NEXT:    xor a0, a0, a1
 ; RV64IZbbNOZbt-NEXT:  .LBB1_2:
 ; RV64IZbbNOZbt-NEXT:    ret
 ;
@@ -169,18 +164,16 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbbZbt-NEXT:    sltu a2, a0, a4
 ; RV32IZbbZbt-NEXT:    sub a3, a1, a5
 ; RV32IZbbZbt-NEXT:    sub a2, a3, a2
-; RV32IZbbZbt-NEXT:    slti a7, a2, 0
-; RV32IZbbZbt-NEXT:    lui a6, 524288
-; RV32IZbbZbt-NEXT:    addi a3, a6, -1
-; RV32IZbbZbt-NEXT:    cmov a6, a7, a3, a6
+; RV32IZbbZbt-NEXT:    srai a6, a2, 31
+; RV32IZbbZbt-NEXT:    lui a3, 524288
+; RV32IZbbZbt-NEXT:    xor a7, a6, a3
 ; RV32IZbbZbt-NEXT:    xor a3, a1, a2
 ; RV32IZbbZbt-NEXT:    xor a1, a1, a5
 ; RV32IZbbZbt-NEXT:    and a1, a1, a3
 ; RV32IZbbZbt-NEXT:    slti a3, a1, 0
-; RV32IZbbZbt-NEXT:    cmov a1, a3, a6, a2
-; RV32IZbbZbt-NEXT:    srai a2, a2, 31
+; RV32IZbbZbt-NEXT:    cmov a1, a3, a7, a2
 ; RV32IZbbZbt-NEXT:    sub a0, a0, a4
-; RV32IZbbZbt-NEXT:    cmov a0, a3, a2, a0
+; RV32IZbbZbt-NEXT:    cmov a0, a3, a6, a0
 ; RV32IZbbZbt-NEXT:    ret
 ;
 ; RV64IZbbZbt-LABEL: func64:
@@ -189,11 +182,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64IZbbZbt-NEXT:    sub a2, a0, a2
 ; RV64IZbbZbt-NEXT:    slt a0, a2, a0
 ; RV64IZbbZbt-NEXT:    xor a0, a1, a0
-; RV64IZbbZbt-NEXT:    slti a1, a2, 0
+; RV64IZbbZbt-NEXT:    srai a1, a2, 63
 ; RV64IZbbZbt-NEXT:    addi a3, zero, -1
-; RV64IZbbZbt-NEXT:    slli a4, a3, 63
-; RV64IZbbZbt-NEXT:    srli a3, a3, 1
-; RV64IZbbZbt-NEXT:    cmov a1, a1, a3, a4
+; RV64IZbbZbt-NEXT:    slli a3, a3, 63
+; RV64IZbbZbt-NEXT:    xor a1, a1, a3
 ; RV64IZbbZbt-NEXT:    cmov a0, a0, a1, a2
 ; RV64IZbbZbt-NEXT:    ret
   %a = mul i64 %y, %z

diff  --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index c960979a3703..198a4d8a4d56 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -60,20 +60,14 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r3, r1, #31
-; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    cset r2, mi
-; CHECK-NEXT:    mvn r3, #-2147483648
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r3, eq
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r0, r2, r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r2, mi
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r3, eq
+; CHECK-NEXT:    mov.w r2, #-2147483648
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    eorne.w r0, r2, r0, asr #31
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csel r1, r2, r1, ne
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    eorne.w r1, r2, r1, asr #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -183,40 +177,34 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    subs r2, r2, r0
 ; CHECK-NEXT:    eor.w r12, r3, r1
-; CHECK-NEXT:    sbc.w r1, r3, r1
-; CHECK-NEXT:    eor.w r2, r3, r1
+; CHECK-NEXT:    sbc.w r0, r3, r1
+; CHECK-NEXT:    eor.w r1, r3, r0
 ; CHECK-NEXT:    vmov r3, r4, d0
-; CHECK-NEXT:    ands.w r2, r2, r12
-; CHECK-NEXT:    vmov lr, r2, d2
+; CHECK-NEXT:    ands.w r1, r1, r12
+; CHECK-NEXT:    vmov lr, r1, d2
 ; CHECK-NEXT:    cset r12, mi
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r0, r1, #31
+; CHECK-NEXT:    asrne r2, r0, #31
 ; CHECK-NEXT:    subs.w r3, r3, lr
-; CHECK-NEXT:    eor.w r5, r4, r2
-; CHECK-NEXT:    sbc.w r2, r4, r2
-; CHECK-NEXT:    eors r4, r2
+; CHECK-NEXT:    eor.w r5, r4, r1
+; CHECK-NEXT:    sbc.w r1, r4, r1
+; CHECK-NEXT:    eors r4, r1
 ; CHECK-NEXT:    ands r5, r4
 ; CHECK-NEXT:    cset r5, mi
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r3, r2, #31
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
-; CHECK-NEXT:    cset r0, mi
-; CHECK-NEXT:    mvn r3, #-2147483648
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cinv r0, r3, eq
+; CHECK-NEXT:    asrne r3, r1, #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r0, r0, r1, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cinv r1, r3, eq
+; CHECK-NEXT:    mov.w r2, #-2147483648
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    eorne.w r0, r2, r0, asr #31
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csel r1, r1, r2, ne
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    eorne.w r1, r2, r1, asr #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:

diff  --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll
index 48f5d6c76096..be52956a8106 100644
--- a/llvm/test/CodeGen/X86/combine-add-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll
@@ -77,11 +77,10 @@ define <8 x i16> @combine_constfold_undef_v8i16() {
 define i32 @combine_constant_i32(i32 %a0) {
 ; CHECK-LABEL: combine_constant_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    incl %ecx
-; CHECK-NEXT:    setns %al
-; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 1(%rdi), %eax
+; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
 ; CHECK-NEXT:    incl %edi
 ; CHECK-NEXT:    cmovnol %edi, %eax
 ; CHECK-NEXT:    retq
@@ -125,13 +124,13 @@ define <8 x i16> @combine_zero_v8i16(<8 x i16> %a0) {
 define i32 @combine_no_overflow_i32(i32 %a0, i32 %a1) {
 ; CHECK-LABEL: combine_no_overflow_i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
 ; CHECK-NEXT:    sarl $16, %edi
 ; CHECK-NEXT:    shrl $16, %esi
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    addl %esi, %ecx
-; CHECK-NEXT:    setns %al
-; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    leal (%rdi,%rsi), %eax
+; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
 ; CHECK-NEXT:    addl %edi, %esi
 ; CHECK-NEXT:    cmovnol %esi, %eax
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll
index e866c8256442..e839c5e18bc3 100644
--- a/llvm/test/CodeGen/X86/sadd_sat.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat.ll
@@ -12,26 +12,22 @@ declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    cmovol %ecx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%eax,%ecx), %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    cmovol %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    addl %esi, %ecx
-; X64-NEXT:    setns %al
-; X64-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    sarl $31, %eax
+; X64-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
 ; X64-NEXT:    addl %esi, %edi
 ; X64-NEXT:    cmovnol %edi, %eax
 ; X64-NEXT:    retq
@@ -43,34 +39,27 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seto %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setns %dl
-; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    popl %esi
+; X86-NEXT:    cmovel %ecx, %edx
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func2:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    setns %cl
-; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    leaq (%rdi,%rsi), %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-NEXT:    xorq %rcx, %rax
 ; X64-NEXT:    addq %rsi, %rdi
 ; X64-NEXT:    cmovnoq %rdi, %rax
 ; X64-NEXT:    retq
@@ -81,27 +70,26 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addw %dx, %si
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    addw %dx, %ax
-; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addw %cx, %dx
+; X86-NEXT:    movswl %dx, %edx
+; X86-NEXT:    sarl $15, %edx
+; X86-NEXT:    xorl $-32768, %edx # imm = 0x8000
+; X86-NEXT:    addw %cx, %ax
+; X86-NEXT:    cmovol %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func16:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    addw %si, %cx
-; X64-NEXT:    setns %al
-; X64-NEXT:    addl $32767, %eax # imm = 0x7FFF
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    cwtl
+; X64-NEXT:    sarl $15, %eax
+; X64-NEXT:    xorl $-32768, %eax # imm = 0x8000
 ; X64-NEXT:    addw %si, %di
 ; X64-NEXT:    cmovnol %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -114,28 +102,29 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 ; X86-LABEL: func8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    addb %dl, %ah
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    addb %dl, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addb %cl, %dl
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb $-128, %dl
+; X86-NEXT:    addb %cl, %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    cmovnol %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func8:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %sil, %al
-; X64-NEXT:    setns %cl
-; X64-NEXT:    addl $127, %ecx
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    sarb $7, %al
+; X64-NEXT:    xorb $-128, %al
 ; X64-NEXT:    addb %sil, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    cmovnol %ecx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %y)
@@ -176,72 +165,59 @@ define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%ecx,%eax), %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    setns %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmovol %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    setns %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovol %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    setns %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    cmovol %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%edx,%eax), %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovol %eax, %esi
+; X86-NEXT:    cmovol %esi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    setns %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-NEXT:    leal (%edi,%eax), %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmovol %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%ebx,%eax), %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovol %ebx, %edi
+; X86-NEXT:    cmovol %esi, %ebx
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm3, %xmm3
-; X64-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-NEXT:    pcmpgtd %xmm1, %xmm2
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pcmpgtd %xmm1, %xmm0
-; X64-NEXT:    pxor %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-NEXT:    pcmpgtd %xmm1, %xmm2
-; X64-NEXT:    psrld $1, %xmm2
-; X64-NEXT:    por %xmm3, %xmm2
-; X64-NEXT:    pand %xmm0, %xmm2
-; X64-NEXT:    pandn %xmm1, %xmm0
+; X64-NEXT:    pxor %xmm2, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    pandn %xmm1, %xmm2
+; X64-NEXT:    psrad $31, %xmm1
+; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    pand %xmm1, %xmm0
 ; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);

diff  --git a/llvm/test/CodeGen/X86/sadd_sat_plus.ll b/llvm/test/CodeGen/X86/sadd_sat_plus.ll
index e50f6d5b6c55..6deab30470b4 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_plus.ll
@@ -11,28 +11,24 @@ declare i64 @llvm.sadd.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: func32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    cmovol %ecx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%eax,%ecx), %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    cmovol %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func32:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    imull %edx, %esi
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    addl %esi, %ecx
-; X64-NEXT:    setns %al
-; X64-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    sarl $31, %eax
+; X64-NEXT:    xorl $-2147483648, %eax # imm = 0x80000000
 ; X64-NEXT:    addl %edi, %esi
 ; X64-NEXT:    cmovnol %esi, %eax
 ; X64-NEXT:    retq
@@ -45,34 +41,27 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-LABEL: func64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seto %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setns %dl
-; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    popl %esi
+; X86-NEXT:    cmovel %ecx, %edx
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    setns %cl
-; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    leaq (%rdi,%rdx), %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-NEXT:    xorq %rcx, %rax
 ; X64-NEXT:    addq %rdx, %rdi
 ; X64-NEXT:    cmovnoq %rdi, %rax
 ; X64-NEXT:    retq
@@ -84,31 +73,30 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imulw {{[0-9]+}}(%esp), %ax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addw %dx, %si
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    addw %dx, %ax
-; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addw %cx, %dx
+; X86-NEXT:    movswl %dx, %edx
+; X86-NEXT:    sarl $15, %edx
+; X86-NEXT:    xorl $-32768, %edx # imm = 0x8000
+; X86-NEXT:    addw %cx, %ax
+; X86-NEXT:    cmovol %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func16:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    imull %edx, %esi
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    addw %si, %cx
-; X64-NEXT:    setns %al
-; X64-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X64-NEXT:    addw %si, %di
-; X64-NEXT:    cmovnol %edi, %eax
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    cwtl
+; X64-NEXT:    sarl $15, %eax
+; X64-NEXT:    xorl $-32768, %eax # imm = 0x8000
+; X64-NEXT:    addw %di, %si
+; X64-NEXT:    cmovnol %esi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %a = mul i16 %y, %z
@@ -121,31 +109,32 @@ define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    mulb {{[0-9]+}}(%esp)
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    addb %dl, %ah
-; X86-NEXT:    setns %cl
-; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    addb %dl, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addb %cl, %dl
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb $-128, %dl
+; X86-NEXT:    addb %cl, %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    cmovnol %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %dl
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    addb %al, %dl
-; X64-NEXT:    setns %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    addb %al, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    cmovol %ecx, %eax
+; X64-NEXT:    # kill: def $al killed $al def $rax
+; X64-NEXT:    leal (%rdi,%rax), %ecx
+; X64-NEXT:    sarb $7, %cl
+; X64-NEXT:    xorb $-128, %cl
+; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    movzbl %al, %edx
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    cmovnol %edx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %a = mul i8 %y, %z

diff  --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index cc1d94c9c487..334c7fba0e55 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -429,32 +429,30 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
 define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; SSE-LABEL: v1i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movb (%rdi), %cl
-; SSE-NEXT:    movb (%rsi), %dil
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    movl %ecx, %eax
-; SSE-NEXT:    addb %dil, %al
-; SSE-NEXT:    setns %sil
-; SSE-NEXT:    addl $127, %esi
-; SSE-NEXT:    addb %dil, %cl
+; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    movb (%rsi), %cl
+; SSE-NEXT:    leal (%rax,%rcx), %esi
+; SSE-NEXT:    sarb $7, %sil
+; SSE-NEXT:    xorb $-128, %sil
+; SSE-NEXT:    addb %al, %cl
 ; SSE-NEXT:    movzbl %cl, %eax
-; SSE-NEXT:    cmovol %esi, %eax
-; SSE-NEXT:    movb %al, (%rdx)
+; SSE-NEXT:    movzbl %sil, %ecx
+; SSE-NEXT:    cmovnol %eax, %ecx
+; SSE-NEXT:    movb %cl, (%rdx)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v1i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movb (%rdi), %cl
-; AVX-NEXT:    movb (%rsi), %dil
-; AVX-NEXT:    xorl %esi, %esi
-; AVX-NEXT:    movl %ecx, %eax
-; AVX-NEXT:    addb %dil, %al
-; AVX-NEXT:    setns %sil
-; AVX-NEXT:    addl $127, %esi
-; AVX-NEXT:    addb %dil, %cl
+; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    movb (%rsi), %cl
+; AVX-NEXT:    leal (%rax,%rcx), %esi
+; AVX-NEXT:    sarb $7, %sil
+; AVX-NEXT:    xorb $-128, %sil
+; AVX-NEXT:    addb %al, %cl
 ; AVX-NEXT:    movzbl %cl, %eax
-; AVX-NEXT:    cmovol %esi, %eax
-; AVX-NEXT:    movb %al, (%rdx)
+; AVX-NEXT:    movzbl %sil, %ecx
+; AVX-NEXT:    cmovnol %eax, %ecx
+; AVX-NEXT:    movb %cl, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i8>, <1 x i8>* %px
   %y = load <1 x i8>, <1 x i8>* %py
@@ -468,28 +466,26 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movzwl (%rdi), %eax
 ; SSE-NEXT:    movzwl (%rsi), %ecx
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    movl %eax, %edi
-; SSE-NEXT:    addw %cx, %di
-; SSE-NEXT:    setns %sil
-; SSE-NEXT:    addl $32767, %esi # imm = 0x7FFF
-; SSE-NEXT:    addw %cx, %ax
-; SSE-NEXT:    cmovol %esi, %eax
-; SSE-NEXT:    movw %ax, (%rdx)
+; SSE-NEXT:    leal (%rax,%rcx), %esi
+; SSE-NEXT:    movswl %si, %esi
+; SSE-NEXT:    sarl $15, %esi
+; SSE-NEXT:    xorl $-32768, %esi # imm = 0x8000
+; SSE-NEXT:    addw %ax, %cx
+; SSE-NEXT:    cmovol %esi, %ecx
+; SSE-NEXT:    movw %cx, (%rdx)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v1i16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movzwl (%rdi), %eax
 ; AVX-NEXT:    movzwl (%rsi), %ecx
-; AVX-NEXT:    xorl %esi, %esi
-; AVX-NEXT:    movl %eax, %edi
-; AVX-NEXT:    addw %cx, %di
-; AVX-NEXT:    setns %sil
-; AVX-NEXT:    addl $32767, %esi # imm = 0x7FFF
-; AVX-NEXT:    addw %cx, %ax
-; AVX-NEXT:    cmovol %esi, %eax
-; AVX-NEXT:    movw %ax, (%rdx)
+; AVX-NEXT:    leal (%rax,%rcx), %esi
+; AVX-NEXT:    movswl %si, %esi
+; AVX-NEXT:    sarl $15, %esi
+; AVX-NEXT:    xorl $-32768, %esi # imm = 0x8000
+; AVX-NEXT:    addw %ax, %cx
+; AVX-NEXT:    cmovol %esi, %ecx
+; AVX-NEXT:    movw %cx, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i16>, <1 x i16>* %px
   %y = load <1 x i16>, <1 x i16>* %py
@@ -598,84 +594,76 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    paddd %xmm0, %xmm1
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm2
-; SSSE3-NEXT:    por %xmm3, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    paddd %xmm1, %xmm3
-; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
-; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
 ; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v2i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
 ; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v2i32:
@@ -685,10 +673,8 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -699,84 +685,76 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    paddd %xmm0, %xmm1
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm2
-; SSSE3-NEXT:    por %xmm3, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pandn %xmm1, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    paddd %xmm1, %xmm3
-; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
-; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
 ; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v4i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %xmm2, %xmm3, %xmm4, %xmm3
 ; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v4i32:
@@ -786,10 +764,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -805,29 +781,23 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm5, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pandn %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm7
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    pand %xmm0, %xmm7
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pandn %xmm5, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
@@ -838,55 +808,47 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSSE3-NEXT:    paddd %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pandn %xmm5, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT:    psrld $1, %xmm7
-; SSSE3-NEXT:    por %xmm6, %xmm7
-; SSSE3-NEXT:    pand %xmm0, %xmm7
-; SSSE3-NEXT:    pandn %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm7, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSSE3-NEXT:    paddd %xmm1, %xmm3
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    pandn %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm3, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    paddd %xmm2, %xmm5
-; SSE41-NEXT:    movaps {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm6, %xmm7
+; SSE41-NEXT:    paddd %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pxor %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm5
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    paddd %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm6, %xmm2
-; SSE41-NEXT:    movaps %xmm5, %xmm0
-; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movaps %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i32:
@@ -896,35 +858,37 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm5
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm5, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vblendvps %ymm0, %ymm6, %ymm5, %ymm0
+; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm5, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm2, %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vblendvps %ymm0, %ymm3, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v8i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %ymm2, %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vblendvps %ymm0, %ymm3, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsrad $31, %ymm2, %ymm1
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v8i32:
@@ -934,10 +898,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %ymm1, %ymm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
   %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
@@ -953,57 +915,45 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-NEXT:    paddd %xmm0, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NEXT:    pandn %xmm4, %xmm10
+; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm4, %xmm10
-; SSE2-NEXT:    pandn %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
-; SSE2-NEXT:    psrld $1, %xmm11
-; SSE2-NEXT:    por %xmm10, %xmm11
-; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    pandn %xmm4, %xmm0
-; SSE2-NEXT:    por %xmm11, %xmm0
+; SSE2-NEXT:    pxor %xmm9, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm10, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
 ; SSE2-NEXT:    paddd %xmm1, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    pandn %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm10, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSE2-NEXT:    paddd %xmm2, %xmm6
 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    pandn %xmm9, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm6
+; SSE2-NEXT:    pxor %xmm9, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
 ; SSE2-NEXT:    paddd %xmm3, %xmm7
 ; SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm7, %xmm4
-; SSE2-NEXT:    pandn %xmm9, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm8
-; SSE2-NEXT:    psrld $1, %xmm8
-; SSE2-NEXT:    por %xmm4, %xmm8
-; SSE2-NEXT:    pand %xmm3, %xmm8
-; SSE2-NEXT:    pandn %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm8, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm7, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm7
+; SSE2-NEXT:    pxor %xmm7, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
@@ -1014,105 +964,89 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSSE3-NEXT:    paddd %xmm0, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
 ; SSSE3-NEXT:    pxor %xmm9, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm10
+; SSSE3-NEXT:    pandn %xmm4, %xmm10
+; SSSE3-NEXT:    psrad $31, %xmm4
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm10
-; SSSE3-NEXT:    pandn %xmm9, %xmm10
-; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm11
-; SSSE3-NEXT:    psrld $1, %xmm11
-; SSSE3-NEXT:    por %xmm10, %xmm11
-; SSSE3-NEXT:    pand %xmm0, %xmm11
-; SSSE3-NEXT:    pandn %xmm4, %xmm0
-; SSSE3-NEXT:    por %xmm11, %xmm0
+; SSSE3-NEXT:    pxor %xmm9, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm10, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
 ; SSSE3-NEXT:    paddd %xmm1, %xmm5
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
 ; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm10
-; SSSE3-NEXT:    pandn %xmm9, %xmm10
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm10, %xmm4
-; SSSE3-NEXT:    pand %xmm1, %xmm4
-; SSSE3-NEXT:    pandn %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm5, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
 ; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSSE3-NEXT:    paddd %xmm2, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm6, %xmm4
-; SSSE3-NEXT:    pandn %xmm9, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT:    psrld $1, %xmm5
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm2, %xmm5
-; SSSE3-NEXT:    pandn %xmm6, %xmm2
-; SSSE3-NEXT:    por %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pandn %xmm6, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm6
+; SSSE3-NEXT:    pxor %xmm9, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    por %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
 ; SSSE3-NEXT:    paddd %xmm3, %xmm7
 ; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm7, %xmm4
-; SSSE3-NEXT:    pandn %xmm9, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm8
-; SSSE3-NEXT:    psrld $1, %xmm8
-; SSSE3-NEXT:    por %xmm4, %xmm8
-; SSSE3-NEXT:    pand %xmm3, %xmm8
-; SSSE3-NEXT:    pandn %xmm7, %xmm3
-; SSSE3-NEXT:    por %xmm8, %xmm3
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pandn %xmm7, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm7
+; SSSE3-NEXT:    pxor %xmm7, %xmm9
+; SSSE3-NEXT:    pand %xmm9, %xmm3
+; SSSE3-NEXT:    por %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm3, %xmm8
-; SSE41-NEXT:    movdqa %xmm2, %xmm12
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm9
 ; SSE41-NEXT:    paddd %xmm4, %xmm9
-; SSE41-NEXT:    movaps {{.*#+}} xmm11 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm10, %xmm2
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm9
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    paddd %xmm5, %xmm4
-; SSE41-NEXT:    movaps %xmm10, %xmm2
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT:    pxor %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm12, %xmm3
-; SSE41-NEXT:    paddd %xmm6, %xmm3
-; SSE41-NEXT:    movaps %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    paddd %xmm5, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm12
-; SSE41-NEXT:    pxor %xmm6, %xmm12
-; SSE41-NEXT:    movdqa %xmm12, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm8, %xmm5
-; SSE41-NEXT:    paddd %xmm7, %xmm5
-; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm10
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm10, %xmm2
+; SSE41-NEXT:    paddd %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm10
+; SSE41-NEXT:    pxor %xmm6, %xmm10
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm10, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    paddd %xmm7, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm8
 ; SSE41-NEXT:    pxor %xmm7, %xmm8
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm3
 ; SSE41-NEXT:    movaps %xmm9, %xmm0
-; SSE41-NEXT:    movaps %xmm4, %xmm1
-; SSE41-NEXT:    movaps %xmm3, %xmm2
-; SSE41-NEXT:    movaps %xmm5, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i32:
@@ -1122,41 +1056,47 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm7
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm7, %ymm8, %ymm9, %ymm10
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvps %ymm0, %ymm10, %ymm7, %ymm0
+; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm7, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm6
-; AVX1-NEXT:    vblendvps %ymm6, %ymm8, %ymm9, %ymm7
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm7
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT:    vblendvps %ymm1, %ymm7, %ymm6, %ymm1
+; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm7, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm4
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm4, %ymm5, %ymm6, %ymm7
 ; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendvps %ymm0, %ymm7, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm4, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm4, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm2
-; AVX2-NEXT:    vblendvps %ymm2, %ymm5, %ymm6, %ymm4
 ; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vblendvps %ymm1, %ymm4, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm3
+; AVX2-NEXT:    vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vblendvps %ymm1, %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v16i32:
@@ -1166,10 +1106,8 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vpsrad $31, %zmm1, %zmm0
+; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
@@ -1194,17 +1132,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1225,17 +1160,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSSE3-NEXT:    por %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1331,42 +1263,35 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm7, %xmm7
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm5
-; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm7, %xmm6
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm9, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
 ; SSE2-NEXT:    paddq %xmm3, %xmm1
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pand %xmm9, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -1391,42 +1316,35 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm7, %xmm7
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
 ; SSSE3-NEXT:    pxor %xmm6, %xmm7
-; SSSE3-NEXT:    movdqa %xmm7, %xmm5
-; SSSE3-NEXT:    pandn %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm7, %xmm6
+; SSSE3-NEXT:    pandn %xmm0, %xmm6
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm8, %xmm0
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm9, %xmm6
-; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
 ; SSSE3-NEXT:    pand %xmm7, %xmm0
-; SSSE3-NEXT:    por %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    pxor %xmm4, %xmm6
 ; SSSE3-NEXT:    paddq %xmm3, %xmm1
 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm6, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm4, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
 ; SSSE3-NEXT:    pandn %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT:    pand %xmm9, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
 ; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
@@ -1477,35 +1395,39 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm5
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm5, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm6, %ymm5, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm5, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v4i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512F-NEXT:    vblendvpd %ymm2, %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vblendvpd %ymm0, %ymm3, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsraq $63, %zmm2, %zmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512F-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v4i64:
@@ -1515,10 +1437,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm2, %k2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpsraq $63, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
@@ -1544,59 +1464,52 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    por %xmm9, %xmm10
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pxor %xmm13, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm13
-; SSE2-NEXT:    pxor %xmm10, %xmm13
-; SSE2-NEXT:    movdqa %xmm13, %xmm12
-; SSE2-NEXT:    pandn %xmm0, %xmm12
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pand %xmm13, %xmm0
-; SSE2-NEXT:    por %xmm12, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm12
-; SSE2-NEXT:    pxor %xmm8, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    pandn %xmm0, %xmm10
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
 ; SSE2-NEXT:    paddq %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm12, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE2-NEXT:    movdqa %xmm10, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm14, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm12
+; SSE2-NEXT:    pand %xmm13, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm10
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm12, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm13
-; SSE2-NEXT:    pandn %xmm1, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm4
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm1
 ; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    por %xmm13, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm12
-; SSE2-NEXT:    pxor %xmm8, %xmm12
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
 ; SSE2-NEXT:    paddq %xmm6, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm12, %xmm4
+; SSE2-NEXT:    movdqa %xmm10, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
+; SSE2-NEXT:    pand %xmm12, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
@@ -1605,12 +1518,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm4, %xmm6
 ; SSE2-NEXT:    movdqa %xmm6, %xmm4
 ; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm5
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm2
 ; SSE2-NEXT:    pand %xmm6, %xmm2
 ; SSE2-NEXT:    por %xmm4, %xmm2
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
@@ -1626,17 +1536,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT:    pxor %xmm5, %xmm11
+; SSE2-NEXT:    movdqa %xmm11, %xmm4
 ; SSE2-NEXT:    pandn %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm11
-; SSE2-NEXT:    pand %xmm10, %xmm11
-; SSE2-NEXT:    por %xmm11, %xmm3
-; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm3
+; SSE2-NEXT:    pand %xmm11, %xmm3
 ; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
@@ -1658,59 +1565,52 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    por %xmm9, %xmm10
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pxor %xmm13, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm13
-; SSSE3-NEXT:    pxor %xmm10, %xmm13
-; SSSE3-NEXT:    movdqa %xmm13, %xmm12
-; SSSE3-NEXT:    pandn %xmm0, %xmm12
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm10, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm0
-; SSSE3-NEXT:    pand %xmm13, %xmm0
-; SSSE3-NEXT:    por %xmm12, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm12
-; SSSE3-NEXT:    pxor %xmm8, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm10
+; SSSE3-NEXT:    pandn %xmm0, %xmm10
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm9, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm10
+; SSSE3-NEXT:    pxor %xmm8, %xmm10
 ; SSSE3-NEXT:    paddq %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    movdqa %xmm12, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSSE3-NEXT:    movdqa %xmm10, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm14, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm12
+; SSSE3-NEXT:    pand %xmm13, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm10
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pxor %xmm5, %xmm5
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm12, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm13
-; SSSE3-NEXT:    pandn %xmm1, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm4
-; SSSE3-NEXT:    pand %xmm10, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pandn %xmm1, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
 ; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    por %xmm13, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm12
-; SSSE3-NEXT:    pxor %xmm8, %xmm12
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm10
+; SSSE3-NEXT:    pxor %xmm8, %xmm10
 ; SSSE3-NEXT:    paddq %xmm6, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    movdqa %xmm12, %xmm4
+; SSSE3-NEXT:    movdqa %xmm10, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
+; SSSE3-NEXT:    pand %xmm12, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm5, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
@@ -1719,12 +1619,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm4, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm4
 ; SSSE3-NEXT:    pandn %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm5
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm2
 ; SSSE3-NEXT:    pand %xmm6, %xmm2
 ; SSSE3-NEXT:    por %xmm4, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
@@ -1740,17 +1637,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm4, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSSE3-NEXT:    pxor %xmm5, %xmm11
+; SSSE3-NEXT:    movdqa %xmm11, %xmm4
 ; SSSE3-NEXT:    pandn %xmm3, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm11
-; SSSE3-NEXT:    pand %xmm10, %xmm11
-; SSSE3-NEXT:    por %xmm11, %xmm3
-; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm3
+; SSSE3-NEXT:    pand %xmm11, %xmm3
 ; SSSE3-NEXT:    por %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
@@ -1835,41 +1729,49 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm7
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm9 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm7, %ymm8, %ymm9, %ymm10
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorpd %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm10, %ymm7, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm6
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm8, %ymm9, %ymm7
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vxorpd %ymm5, %ymm4, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm7, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm8
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
 ; AVX1-NEXT:    vxorpd %ymm1, %ymm3, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm7, %ymm6, %ymm1
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vxorpd %ymm5, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm8, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm4
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm6, %ymm7
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm7, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm6, %ymm4
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm5
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm4, %ymm0
+; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm4, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm4, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i64:
@@ -1879,10 +1781,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
+; AVX512-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
@@ -1900,31 +1800,25 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; SSE-NEXT:    movq %r8, %rbx
 ; SSE-NEXT:    sarq $63, %rbx
 ; SSE-NEXT:    testb %r10b, %r10b
-; SSE-NEXT:    cmoveq %rcx, %rbx
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    setns %cl
-; SSE-NEXT:    movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
-; SSE-NEXT:    addq %r11, %rcx
+; SSE-NEXT:    cmovneq %rbx, %rcx
+; SSE-NEXT:    movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %r11, %rbx
 ; SSE-NEXT:    testb %r10b, %r10b
-; SSE-NEXT:    cmoveq %r8, %rcx
+; SSE-NEXT:    cmoveq %r8, %rbx
 ; SSE-NEXT:    addq %r9, %rsi
 ; SSE-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; SSE-NEXT:    seto %r8b
 ; SSE-NEXT:    movq %rdx, %rdi
 ; SSE-NEXT:    sarq $63, %rdi
 ; SSE-NEXT:    testb %r8b, %r8b
-; SSE-NEXT:    cmoveq %rsi, %rdi
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    setns %sil
-; SSE-NEXT:    addq %r11, %rsi
+; SSE-NEXT:    cmovneq %rdi, %rsi
+; SSE-NEXT:    xorq %r11, %rdi
 ; SSE-NEXT:    testb %r8b, %r8b
-; SSE-NEXT:    cmoveq %rdx, %rsi
-; SSE-NEXT:    movq %rbx, 16(%rax)
-; SSE-NEXT:    movq %rdi, (%rax)
-; SSE-NEXT:    movq %rcx, 24(%rax)
-; SSE-NEXT:    movq %rsi, 8(%rax)
+; SSE-NEXT:    cmoveq %rdx, %rdi
+; SSE-NEXT:    movq %rcx, 16(%rax)
+; SSE-NEXT:    movq %rsi, (%rax)
+; SSE-NEXT:    movq %rbx, 24(%rax)
+; SSE-NEXT:    movq %rdi, 8(%rax)
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    retq
 ;
@@ -1938,31 +1832,25 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; AVX-NEXT:    movq %r8, %rbx
 ; AVX-NEXT:    sarq $63, %rbx
 ; AVX-NEXT:    testb %r10b, %r10b
-; AVX-NEXT:    cmoveq %rcx, %rbx
-; AVX-NEXT:    xorl %ecx, %ecx
-; AVX-NEXT:    testq %r8, %r8
-; AVX-NEXT:    setns %cl
-; AVX-NEXT:    movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
-; AVX-NEXT:    addq %r11, %rcx
+; AVX-NEXT:    cmovneq %rbx, %rcx
+; AVX-NEXT:    movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; AVX-NEXT:    xorq %r11, %rbx
 ; AVX-NEXT:    testb %r10b, %r10b
-; AVX-NEXT:    cmoveq %r8, %rcx
+; AVX-NEXT:    cmoveq %r8, %rbx
 ; AVX-NEXT:    addq %r9, %rsi
 ; AVX-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; AVX-NEXT:    seto %r8b
 ; AVX-NEXT:    movq %rdx, %rdi
 ; AVX-NEXT:    sarq $63, %rdi
 ; AVX-NEXT:    testb %r8b, %r8b
-; AVX-NEXT:    cmoveq %rsi, %rdi
-; AVX-NEXT:    xorl %esi, %esi
-; AVX-NEXT:    testq %rdx, %rdx
-; AVX-NEXT:    setns %sil
-; AVX-NEXT:    addq %r11, %rsi
+; AVX-NEXT:    cmovneq %rdi, %rsi
+; AVX-NEXT:    xorq %r11, %rdi
 ; AVX-NEXT:    testb %r8b, %r8b
-; AVX-NEXT:    cmoveq %rdx, %rsi
-; AVX-NEXT:    movq %rbx, 16(%rax)
-; AVX-NEXT:    movq %rdi, (%rax)
-; AVX-NEXT:    movq %rcx, 24(%rax)
-; AVX-NEXT:    movq %rsi, 8(%rax)
+; AVX-NEXT:    cmoveq %rdx, %rdi
+; AVX-NEXT:    movq %rcx, 16(%rax)
+; AVX-NEXT:    movq %rsi, (%rax)
+; AVX-NEXT:    movq %rbx, 24(%rax)
+; AVX-NEXT:    movq %rdi, 8(%rax)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    retq
   %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)

diff  --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll
index bdb45877c913..7199d326738c 100644
--- a/llvm/test/CodeGen/X86/ssub_sat.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat.ll
@@ -39,23 +39,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seto %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setns %dl
-; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    popl %esi
+; X86-NEXT:    cmovel %ecx, %edx
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
@@ -212,20 +207,19 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psubd %xmm1, %xmm3
-; X64-NEXT:    pcmpgtd %xmm2, %xmm1
-; X64-NEXT:    pcmpgtd %xmm3, %xmm0
+; X64-NEXT:    pxor %xmm3, %xmm3
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    psubd %xmm1, %xmm2
+; X64-NEXT:    pcmpgtd %xmm3, %xmm1
+; X64-NEXT:    pcmpgtd %xmm2, %xmm0
 ; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
-; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    pcmpgtd %xmm3, %xmm2
-; X64-NEXT:    psrld $1, %xmm2
-; X64-NEXT:    por %xmm2, %xmm1
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    pandn %xmm3, %xmm0
-; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pandn %xmm2, %xmm1
+; X64-NEXT:    psrad $31, %xmm2
+; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    por %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp

diff  --git a/llvm/test/CodeGen/X86/ssub_sat_plus.ll b/llvm/test/CodeGen/X86/ssub_sat_plus.ll
index dc0804994a77..51cd25c7885a 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_plus.ll
@@ -41,23 +41,18 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-LABEL: func64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seto %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setns %dl
-; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    xorl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    popl %esi
+; X86-NEXT:    cmovel %ecx, %edx
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 0ae0cb5b557e..66da1cb172db 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -612,55 +612,51 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i32:
@@ -670,8 +666,8 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -682,9 +678,9 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -695,9 +691,9 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -708,10 +704,8 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -721,55 +715,51 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    psrld $1, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i32:
@@ -779,8 +769,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -791,9 +781,9 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -804,9 +794,9 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -817,10 +807,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -830,97 +818,87 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm7
-; SSE2-NEXT:    por %xmm7, %xmm6
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    pandn %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psubd %xmm3, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psubd %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    pandn %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pandn %xmm5, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pandn %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT:    psrld $1, %xmm7
-; SSSE3-NEXT:    por %xmm7, %xmm6
-; SSSE3-NEXT:    pand %xmm0, %xmm6
-; SSSE3-NEXT:    pandn %xmm5, %xmm0
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSSE3-NEXT:    psubd %xmm3, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    psubd %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    psubd %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSSE3-NEXT:    pxor %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
 ; SSSE3-NEXT:    pandn %xmm2, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    pandn %xmm5, %xmm1
-; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm6, %xmm6
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pxor %xmm8, %xmm8
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    psubd %xmm2, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT:    pxor %xmm2, %xmm4
-; SSE41-NEXT:    movaps {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm6, %xmm2
+; SSE41-NEXT:    psubd %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
+; SSE41-NEXT:    psubd %xmm3, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm2
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm5
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psubd %xmm3, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm7, %xmm6
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm6, %xmm2
-; SSE41-NEXT:    movaps %xmm5, %xmm0
-; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movaps %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i32:
@@ -937,10 +915,12 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm3
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm3, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i32:
@@ -950,9 +930,9 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -963,9 +943,9 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX512F-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsrad $31, %ymm1, %ymm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512F-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -976,10 +956,8 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtd %ymm1, %ymm2, %k2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpsrad $31, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
@@ -989,176 +967,157 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm8, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    psubd %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm9, %xmm11
-; SSE2-NEXT:    pandn %xmm10, %xmm11
-; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    psubd %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm11
-; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    pandn %xmm9, %xmm0
-; SSE2-NEXT:    por %xmm11, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NEXT:    psubd %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT:    pxor %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
-; SSE2-NEXT:    pandn %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    pandn %xmm9, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    psubd %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT:    pxor %xmm5, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; SSE2-NEXT:    psubd %xmm6, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psubd %xmm7, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    psubd %xmm7, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
 ; SSE2-NEXT:    pxor %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm10, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
-; SSE2-NEXT:    psrld $1, %xmm8
-; SSE2-NEXT:    por %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm5, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
 ; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    pandn %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm8, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    psubd %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm9, %xmm11
-; SSSE3-NEXT:    pandn %xmm10, %xmm11
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm11
-; SSSE3-NEXT:    pand %xmm0, %xmm11
-; SSSE3-NEXT:    pandn %xmm9, %xmm0
-; SSSE3-NEXT:    por %xmm11, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm9
-; SSSE3-NEXT:    psubd %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT:    pxor %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm9, %xmm5
-; SSSE3-NEXT:    pandn %xmm10, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm9
+; SSSE3-NEXT:    psubd %xmm4, %xmm0
 ; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT:    psrld $1, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm1, %xmm5
-; SSSE3-NEXT:    pandn %xmm9, %xmm1
-; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm0, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    psubd %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSSE3-NEXT:    pxor %xmm5, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm4
+; SSSE3-NEXT:    pandn %xmm1, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    pand %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm4
 ; SSSE3-NEXT:    psubd %xmm6, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSSE3-NEXT:    pxor %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm10, %xmm5
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    psrld $1, %xmm6
-; SSSE3-NEXT:    por %xmm6, %xmm5
-; SSSE3-NEXT:    pand %xmm2, %xmm5
-; SSSE3-NEXT:    pandn %xmm4, %xmm2
-; SSSE3-NEXT:    por %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    psubd %xmm7, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm4, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    pand %xmm2, %xmm4
+; SSSE3-NEXT:    por %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    psubd %xmm7, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm3
 ; SSSE3-NEXT:    pxor %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm4, %xmm5
-; SSSE3-NEXT:    pandn %xmm10, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
-; SSSE3-NEXT:    psrld $1, %xmm8
-; SSSE3-NEXT:    por %xmm8, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm5, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
 ; SSSE3-NEXT:    pand %xmm3, %xmm5
-; SSSE3-NEXT:    pandn %xmm4, %xmm3
-; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm3, %xmm8
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm10, %xmm10
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm11, %xmm11
 ; SSE41-NEXT:    movdqa %xmm0, %xmm9
 ; SSE41-NEXT:    psubd %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT:    movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    movaps %xmm11, %xmm4
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    psubd %xmm5, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm9
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psubd %xmm5, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT:    pxor %xmm5, %xmm1
-; SSE41-NEXT:    movaps %xmm11, %xmm3
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm10, %xmm2
+; SSE41-NEXT:    psubd %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm10
+; SSE41-NEXT:    pxor %xmm6, %xmm10
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm6, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm6, %xmm2
-; SSE41-NEXT:    movaps %xmm11, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm8, %xmm5
-; SSE41-NEXT:    psubd %xmm7, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm10, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    psubd %xmm7, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm11, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm8
 ; SSE41-NEXT:    pxor %xmm7, %xmm8
-; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm12, %xmm11
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm5
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm3
 ; SSE41-NEXT:    movaps %xmm9, %xmm0
-; SSE41-NEXT:    movaps %xmm4, %xmm1
-; SSE41-NEXT:    movaps %xmm3, %xmm2
-; SSE41-NEXT:    movaps %xmm5, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i32:
@@ -1175,25 +1134,30 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm0, %ymm6, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm6
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vblendvps %ymm2, %ymm4, %ymm6, %ymm7
-; AVX1-NEXT:    vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm6, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsubd %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpsubd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm6, %xmm6
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm5
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vblendvps %ymm2, %ymm4, %ymm6, %ymm3
-; AVX1-NEXT:    vblendvps %ymm1, %ymm3, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm5, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
@@ -1203,15 +1167,16 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vblendvps %ymm2, %ymm5, %ymm6, %ymm7
-; AVX2-NEXT:    vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm5
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vblendvps %ymm0, %ymm5, %ymm2, %ymm0
 ; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
 ; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vblendvps %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vpsrad $31, %ymm3, %ymm2
+; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm1
 ; AVX2-NEXT:    retq
 ;
@@ -1222,10 +1187,8 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vpsrad $31, %zmm1, %zmm0
+; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
@@ -1261,12 +1224,9 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -1299,12 +1259,9 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
@@ -1417,34 +1374,29 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
 ; SSE2-NEXT:    psubq %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
@@ -1458,11 +1410,9 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm5, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pand %xmm10, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
@@ -1490,34 +1440,29 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm6, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    movdqa %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
 ; SSSE3-NEXT:    pandn %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm0
-; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm0
 ; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
 ; SSSE3-NEXT:    psubq %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    movdqa %xmm4, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm4, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
@@ -1531,11 +1476,9 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm5, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
 ; SSSE3-NEXT:    pandn %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT:    pand %xmm10, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
 ; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
@@ -1600,31 +1543,33 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm5
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm0, %ymm4, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm4, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
 ; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1635,9 +1580,9 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX512F-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512F-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsraq $63, %zmm1, %zmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512F-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1648,10 +1593,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm2, %k2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpsraq $63, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
@@ -1682,86 +1625,75 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm11, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3]
-; SSE2-NEXT:    por %xmm12, %xmm13
-; SSE2-NEXT:    pxor %xmm10, %xmm13
-; SSE2-NEXT:    movdqa %xmm13, %xmm12
-; SSE2-NEXT:    pandn %xmm0, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT:    por %xmm12, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    pandn %xmm0, %xmm10
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm0
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pand %xmm13, %xmm0
-; SSE2-NEXT:    por %xmm12, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm12
-; SSE2-NEXT:    pxor %xmm8, %xmm12
+; SSE2-NEXT:    pxor %xmm9, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
 ; SSE2-NEXT:    psubq %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm12, %xmm13
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm14, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm12
+; SSE2-NEXT:    pand %xmm12, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm10
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
+; SSE2-NEXT:    pand %xmm11, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    pxor %xmm12, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm13
-; SSE2-NEXT:    pandn %xmm1, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm1
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm5
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm1
 ; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm13, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm12
-; SSE2-NEXT:    pxor %xmm8, %xmm12
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
 ; SSE2-NEXT:    psubq %xmm6, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm12, %xmm4
+; SSE2-NEXT:    movdqa %xmm10, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm13, %xmm5
+; SSE2-NEXT:    pand %xmm11, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm5, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-NEXT:    movdqa %xmm6, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm6
+; SSE2-NEXT:    pand %xmm10, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm6, %xmm5
 ; SSE2-NEXT:    pxor %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm4
 ; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm6
-; SSE2-NEXT:    pand %xmm10, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm2
 ; SSE2-NEXT:    pand %xmm5, %xmm2
 ; SSE2-NEXT:    por %xmm4, %xmm2
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
@@ -1771,10 +1703,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm4
+; SSE2-NEXT:    pand %xmm10, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm7
@@ -1789,11 +1721,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    pxor %xmm5, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pandn %xmm9, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm11
-; SSE2-NEXT:    pand %xmm10, %xmm11
-; SSE2-NEXT:    por %xmm11, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm3
 ; SSE2-NEXT:    por %xmm5, %xmm3
 ; SSE2-NEXT:    retq
@@ -1821,86 +1751,75 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm11, %xmm12
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm9[1,1,3,3]
-; SSSE3-NEXT:    por %xmm12, %xmm13
-; SSSE3-NEXT:    pxor %xmm10, %xmm13
-; SSSE3-NEXT:    movdqa %xmm13, %xmm12
-; SSSE3-NEXT:    pandn %xmm0, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    por %xmm12, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm10
+; SSSE3-NEXT:    pandn %xmm0, %xmm10
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm0
-; SSSE3-NEXT:    pxor %xmm11, %xmm11
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT:    pand %xmm10, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm0
-; SSSE3-NEXT:    pand %xmm13, %xmm0
-; SSSE3-NEXT:    por %xmm12, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm12
-; SSSE3-NEXT:    pxor %xmm8, %xmm12
+; SSSE3-NEXT:    pxor %xmm9, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm10
+; SSSE3-NEXT:    pxor %xmm8, %xmm10
 ; SSSE3-NEXT:    psubq %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    movdqa %xmm12, %xmm13
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm4
+; SSSE3-NEXT:    movdqa %xmm10, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm14, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm12
+; SSSE3-NEXT:    pand %xmm12, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm10
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
+; SSSE3-NEXT:    pand %xmm11, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm5, %xmm4
-; SSSE3-NEXT:    pxor %xmm12, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm13
-; SSSE3-NEXT:    pandn %xmm1, %xmm13
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm1
-; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm5
-; SSSE3-NEXT:    pand %xmm10, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pandn %xmm1, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    por %xmm13, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm12
-; SSSE3-NEXT:    pxor %xmm8, %xmm12
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm10
+; SSSE3-NEXT:    pxor %xmm8, %xmm10
 ; SSSE3-NEXT:    psubq %xmm6, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    movdqa %xmm12, %xmm4
+; SSSE3-NEXT:    movdqa %xmm10, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm13, %xmm5
+; SSSE3-NEXT:    pand %xmm11, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm5, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm6, %xmm5
 ; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm6
+; SSSE3-NEXT:    pand %xmm10, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm6, %xmm5
 ; SSSE3-NEXT:    pxor %xmm4, %xmm5
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm4
 ; SSSE3-NEXT:    pandn %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm6
-; SSSE3-NEXT:    pand %xmm10, %xmm6
-; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm2
 ; SSSE3-NEXT:    pand %xmm5, %xmm2
 ; SSSE3-NEXT:    por %xmm4, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
@@ -1910,10 +1829,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm4
+; SSSE3-NEXT:    pand %xmm10, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm4, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm7
@@ -1928,11 +1847,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm5, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
 ; SSSE3-NEXT:    pandn %xmm3, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pandn %xmm9, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm11
-; SSSE3-NEXT:    pand %xmm10, %xmm11
-; SSSE3-NEXT:    por %xmm11, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm9, %xmm3
 ; SSSE3-NEXT:    pand %xmm4, %xmm3
 ; SSSE3-NEXT:    por %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
@@ -2042,37 +1959,42 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ;
 ; AVX1-LABEL: v8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorpd %ymm0, %ymm6, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm4, %ymm6, %ymm7
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vxorpd %ymm5, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm7, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm7
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX1-NEXT:    vxorpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT:    vxorpd %ymm1, %ymm6, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm6
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm4, %ymm6, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
+; AVX1-NEXT:    vxorpd %ymm5, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm6, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i64:
@@ -2082,15 +2004,16 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm6, %ymm7
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm2, %ymm0
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm3
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
 ; AVX2-NEXT:    retq
 ;
@@ -2101,10 +2024,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm2, %k2
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
+; AVX512-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
@@ -2122,31 +2043,25 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; SSE-NEXT:    movq %r8, %rbx
 ; SSE-NEXT:    sarq $63, %rbx
 ; SSE-NEXT:    testb %r10b, %r10b
-; SSE-NEXT:    cmoveq %rcx, %rbx
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    setns %cl
-; SSE-NEXT:    movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
-; SSE-NEXT:    addq %r11, %rcx
+; SSE-NEXT:    cmovneq %rbx, %rcx
+; SSE-NEXT:    movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %r11, %rbx
 ; SSE-NEXT:    testb %r10b, %r10b
-; SSE-NEXT:    cmoveq %r8, %rcx
+; SSE-NEXT:    cmoveq %r8, %rbx
 ; SSE-NEXT:    subq %r9, %rsi
 ; SSE-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
 ; SSE-NEXT:    seto %r8b
 ; SSE-NEXT:    movq %rdx, %rdi
 ; SSE-NEXT:    sarq $63, %rdi
 ; SSE-NEXT:    testb %r8b, %r8b
-; SSE-NEXT:    cmoveq %rsi, %rdi
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    setns %sil
-; SSE-NEXT:    addq %r11, %rsi
+; SSE-NEXT:    cmovneq %rdi, %rsi
+; SSE-NEXT:    xorq %r11, %rdi
 ; SSE-NEXT:    testb %r8b, %r8b
-; SSE-NEXT:    cmoveq %rdx, %rsi
-; SSE-NEXT:    movq %rbx, 16(%rax)
-; SSE-NEXT:    movq %rdi, (%rax)
-; SSE-NEXT:    movq %rcx, 24(%rax)
-; SSE-NEXT:    movq %rsi, 8(%rax)
+; SSE-NEXT:    cmoveq %rdx, %rdi
+; SSE-NEXT:    movq %rcx, 16(%rax)
+; SSE-NEXT:    movq %rsi, (%rax)
+; SSE-NEXT:    movq %rbx, 24(%rax)
+; SSE-NEXT:    movq %rdi, 8(%rax)
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    retq
 ;
@@ -2160,31 +2075,25 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; AVX-NEXT:    movq %r8, %rbx
 ; AVX-NEXT:    sarq $63, %rbx
 ; AVX-NEXT:    testb %r10b, %r10b
-; AVX-NEXT:    cmoveq %rcx, %rbx
-; AVX-NEXT:    xorl %ecx, %ecx
-; AVX-NEXT:    testq %r8, %r8
-; AVX-NEXT:    setns %cl
-; AVX-NEXT:    movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
-; AVX-NEXT:    addq %r11, %rcx
+; AVX-NEXT:    cmovneq %rbx, %rcx
+; AVX-NEXT:    movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; AVX-NEXT:    xorq %r11, %rbx
 ; AVX-NEXT:    testb %r10b, %r10b
-; AVX-NEXT:    cmoveq %r8, %rcx
+; AVX-NEXT:    cmoveq %r8, %rbx
 ; AVX-NEXT:    subq %r9, %rsi
 ; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
 ; AVX-NEXT:    seto %r8b
 ; AVX-NEXT:    movq %rdx, %rdi
 ; AVX-NEXT:    sarq $63, %rdi
 ; AVX-NEXT:    testb %r8b, %r8b
-; AVX-NEXT:    cmoveq %rsi, %rdi
-; AVX-NEXT:    xorl %esi, %esi
-; AVX-NEXT:    testq %rdx, %rdx
-; AVX-NEXT:    setns %sil
-; AVX-NEXT:    addq %r11, %rsi
+; AVX-NEXT:    cmovneq %rdi, %rsi
+; AVX-NEXT:    xorq %r11, %rdi
 ; AVX-NEXT:    testb %r8b, %r8b
-; AVX-NEXT:    cmoveq %rdx, %rsi
-; AVX-NEXT:    movq %rbx, 16(%rax)
-; AVX-NEXT:    movq %rdi, (%rax)
-; AVX-NEXT:    movq %rcx, 24(%rax)
-; AVX-NEXT:    movq %rsi, 8(%rax)
+; AVX-NEXT:    cmoveq %rdx, %rdi
+; AVX-NEXT:    movq %rcx, 16(%rax)
+; AVX-NEXT:    movq %rsi, (%rax)
+; AVX-NEXT:    movq %rbx, 24(%rax)
+; AVX-NEXT:    movq %rdi, 8(%rax)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    retq
   %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
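
The scalar and vector CHECK lines above all share the same shape: the
saturation value is now derived from the wrapped difference with an
arithmetic shift and an xor (sarq $63 / xorq in the v2i128 code,
psrad $31 / pxor in the SSE vectors, vpsraq $63 / vpxorq under AVX512),
and the overflow condition selects between it and the plain difference.
A minimal standalone C++ sketch of that shape for a single i64 lane,
assuming the GCC/Clang __builtin_sub_overflow builtin and an arithmetic
right shift for negative int64_t (implementation-defined in ISO C++, but
what the targets in these tests do):

  #include <cstdint>

  // Sketch only, not LLVM source: compute the wrapped difference, derive
  // the saturation constant from its sign, and select on overflow.
  int64_t ssub_sat_i64(int64_t x, int64_t y) {
    int64_t r;
    bool ov = __builtin_sub_overflow(x, y, &r); // subq/sbbq + seto above
    int64_t s = (r >> 63) ^ INT64_MIN;          // sarq $63, then xorq with
                                                // 0x8000000000000000 gives
                                                // INT64_MAX or INT64_MIN
    return ov ? s : r;                          // the cmov / blendv step
  }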
