[llvm] [TargetLowering] Change subtraction to do (LHS < RHS) XOR (RESULT < 0) (PR #150872)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 28 09:04:27 PDT 2025


https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/150872

From 791565225a3415fcca0482ec9897c818594c04a5 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Mon, 28 Jul 2025 00:35:42 -0400
Subject: [PATCH] [TargetLowering] Change subtraction to do (LHS < RHS) XOR
 (RESULT < 0)

This form, (LHS < RHS) XOR (RESULT < 0), folds better than the previous (RESULT < LHS) XOR (RHS > 0) expansion.
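
For reference, here is a minimal standalone C++ sketch of the identity this
change relies on (the function names are made up for illustration and are not
LLVM helpers): for a wrapping 32-bit subtraction, the new predicate
(LHS < RHS) XOR (RESULT < 0) computes the same overflow bit as the old
(RESULT < LHS) XOR (RHS > 0) form.

  // Standalone sketch; ssubOverflowOld/ssubOverflowNew are illustrative
  // names, not LLVM APIs.
  #include <cassert>
  #include <cstdint>
  #include <limits>

  // Two's-complement (wrapping) 32-bit subtraction.
  static int32_t wrappingSub(int32_t LHS, int32_t RHS) {
    return static_cast<int32_t>(static_cast<uint32_t>(LHS) -
                                static_cast<uint32_t>(RHS));
  }

  // Old expansion: overflow iff (Result < LHS) != (RHS > 0).
  static bool ssubOverflowOld(int32_t LHS, int32_t RHS) {
    int32_t Result = wrappingSub(LHS, RHS);
    return (Result < LHS) != (RHS > 0);
  }

  // New expansion: overflow iff (LHS < RHS) != (Result < 0).
  static bool ssubOverflowNew(int32_t LHS, int32_t RHS) {
    int32_t Result = wrappingSub(LHS, RHS);
    return (LHS < RHS) != (Result < 0);
  }

  int main() {
    const int32_t Min = std::numeric_limits<int32_t>::min();
    const int32_t Max = std::numeric_limits<int32_t>::max();
    const int32_t Samples[] = {Min, Min + 1, -2, -1, 0, 1, 2, Max - 1, Max};
    // The two predicates agree on every pair, including the wrapping cases.
    for (int32_t L : Samples)
      for (int32_t R : Samples)
        assert(ssubOverflowOld(L, R) == ssubOverflowNew(L, R));
    return 0;
  }

The sketch only demonstrates the equivalence; whether the new form actually
folds better on a given target is what the updated tests below show.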
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   31 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   32 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   29 +
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           | 1010 +++++-----
 .../test/CodeGen/RISCV/arith-with-overflow.ll |    4 +-
 llvm/test/CodeGen/RISCV/ssub_sat.ll           |   14 +-
 llvm/test/CodeGen/RISCV/ssub_sat_plus.ll      |   18 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |  120 +-
 llvm/test/CodeGen/RISCV/xqcia.ll              |    7 +-
 .../CodeGen/Thumb2/mve-saturating-arith.ll    |   55 +-
 .../CodeGen/X86/expand-vp-int-intrinsics.ll   |   79 +-
 llvm/test/CodeGen/X86/ssub_sat.ll             |   24 +-
 llvm/test/CodeGen/X86/ssub_sat_vec.ll         | 1743 +++++++----------
 llvm/test/CodeGen/X86/vec_ssubo.ll            |  542 ++---
 14 files changed, 1755 insertions(+), 1953 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f7d9367..5830016dfd82b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8871,21 +8871,26 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
     MIRBuilder.buildSub(NewDst0, LHS, RHS);
 
   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
-
   auto Zero = MIRBuilder.buildConstant(Ty, 0);
 
-  // For an addition, the result should be less than one of the operands (LHS)
-  // if and only if the other operand (RHS) is negative, otherwise there will
-  // be overflow.
-  // For a subtraction, the result should be less than one of the operands
-  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
-  // otherwise there will be overflow.
-  auto ResultLowerThanLHS =
-      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
-  auto ConditionRHS = MIRBuilder.buildICmp(
-      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
-
-  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
+  if (IsAdd) {
+    // For addition, the result should be less than one of the operands (LHS)
+    // if and only if the other operand (RHS) is negative, otherwise there will
+    // be overflow.
+    auto ResultLowerThanLHS =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
+    auto RHSNegative =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, RHS, Zero);
+    MIRBuilder.buildXor(Dst1, RHSNegative, ResultLowerThanLHS);
+  } else {
+    // For subtraction, overflow occurs when the signed comparison of the
+    // operands (LHS < RHS) does not match the sign of the result.
+    auto LHSLessThanRHS =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS, RHS);
+    auto ResultNegative =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, Zero);
+    MIRBuilder.buildXor(Dst1, LHSLessThanRHS, ResultNegative);
+  }
 
   MIRBuilder.buildCopy(Dst0, NewDst0);
   MI.eraseFromParent();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910861df4..7d9c8e3865405 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11441,19 +11441,25 @@ void TargetLowering::expandSADDSUBO(
 
   SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
 
-  // For an addition, the result should be less than one of the operands (LHS)
-  // if and only if the other operand (RHS) is negative, otherwise there will
-  // be overflow.
-  // For a subtraction, the result should be less than one of the operands
-  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
-  // otherwise there will be overflow.
-  SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
-  SDValue ConditionRHS =
-      DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
-
-  Overflow = DAG.getBoolExtOrTrunc(
-      DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
-      ResultType, ResultType);
+  if (IsAdd) {
+    // For addition, the result should be less than one of the operands (LHS)
+    // if and only if the other operand (RHS) is negative, otherwise there will
+    // be overflow.
+    SDValue ResultLowerThanLHS =
+        DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+    SDValue ConditionRHS = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETLT);
+    Overflow = DAG.getBoolExtOrTrunc(
+        DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+        ResultType, ResultType);
+  } else {
+    // For subtraction, overflow occurs when the signed comparison of the
+    // operands (LHS < RHS) does not match the sign of the result.
+    SDValue LHSLessThanRHS = DAG.getSetCC(dl, OType, LHS, RHS, ISD::SETLT);
+    SDValue ResultNegative = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETLT);
+    Overflow = DAG.getBoolExtOrTrunc(
+        DAG.getNode(ISD::XOR, dl, OType, LHSLessThanRHS, ResultNegative), dl,
+        ResultType, ResultType);
+  }
 }
 
 bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3d859f8..88a92094f4772 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14497,6 +14497,35 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(customLegalizeToWOp(N, DAG, ExtOpc));
     break;
   }
+  case ISD::SSUBO: {
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+
+    // If the RHS is a constant, the comparison against it below can be
+    // simplified. Otherwise use the default legalization.
+    if (!isa<ConstantSDNode>(N->getOperand(1)))
+      return;
+
+    SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0));
+    SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(1));
+    SDValue Res = DAG.getNode(ISD::SUB, DL, MVT::i64, LHS, RHS);
+    Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Res,
+                      DAG.getValueType(MVT::i32));
+
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+    // For subtraction, overflow occurs when the signed comparison of the
+    // operands (LHS < RHS) does not match the sign of the result.
+    EVT OType = N->getValueType(1);
+    SDValue LHSLessThanRHS = DAG.getSetCC(DL, OType, LHS, RHS, ISD::SETLT);
+    SDValue ResultNegative = DAG.getSetCC(DL, OType, Res, Zero, ISD::SETLT);
+    SDValue Overflow =
+        DAG.getNode(ISD::XOR, DL, OType, LHSLessThanRHS, ResultNegative);
+
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+    Results.push_back(Overflow);
+    return;
+  }
   case ISD::SADDO: {
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 40d80f5e83e36..24ad6af504b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -80,13 +80,13 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-LABEL: v_ssubsat_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i16:
@@ -120,25 +120,25 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
-; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v0, v1
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v1
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v0, v1
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i32:
@@ -181,21 +181,21 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_sub_u16_e32 v2, v3, v2
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0xffff8000, v3
+; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -244,27 +244,27 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_sub_u16_e32 v4, v5, v4
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0xffff8000, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v3
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0xffff8000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v2
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -321,39 +321,39 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_sub_u16_e32 v4, v5, v4
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, 0xffff8000, v5
+; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v2
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_sub_u16_e32 v5, v4, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v3
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v2
+; GFX8-NEXT:    v_sub_u16_e32 v2, v4, v2
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
+; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v3
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0xffff8000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -379,39 +379,39 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v2
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v2
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v2
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v3
+; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v0, v2
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i32:
@@ -435,53 +435,53 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v3
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v3
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v4
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v2, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v5
+; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v5
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v0, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v1, v4
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v2, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v3i32:
@@ -507,67 +507,67 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v0, v4
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v4
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v5
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v6
+; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v6
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v7
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v7
+; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v7
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v0, v4
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v1, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v2, v6
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v7
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i32:
@@ -595,123 +595,123 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v8i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v8
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v8
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v8
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v1, v9
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v9
+; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v9
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v10
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v10
+; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v10
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v11
+; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v11
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v12
+; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v4, v12
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v5, v13
+; GFX6-NEXT:    v_sub_i32_e64 v5, s[4:5], v5, v13
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v6, v14
+; GFX6-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v14
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v7, v15
+; GFX6-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v15
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v8i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v0, v8
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v8
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v1, v9
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v9
+; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v2, v10
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v10
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v11
+; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v11
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v12
+; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v12
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v5, v13
+; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v6, v14
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v7, v15
+; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v8i32:
@@ -747,239 +747,239 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v0, v16
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v16
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v16
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v0
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v1, v17
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v17
+; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v17
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v1
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v2, v18
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v16, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v18
+; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v18
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v2
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v19
+; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v19
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v16, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v20
+; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v4, v20
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v16, vcc
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v5, v21
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v5, v21
+; GFX6-NEXT:    v_sub_i32_e64 v5, s[4:5], v5, v21
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v5
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v6, v22
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v6, v22
+; GFX6-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v22
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v6
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v7, v23
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
-; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v7, v23
+; GFX6-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v23
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v8, v24
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
-; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v8, v24
+; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v24
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v8
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v9, v25
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
-; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v8, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v9, v25
+; GFX6-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v25
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v9
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v9
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v10, v26
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
-; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v10, v26
+; GFX6-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v26
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v10
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v10
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v11, v27
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
-; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v11, v27
+; GFX6-NEXT:    v_sub_i32_e64 v11, s[4:5], v11, v27
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v11
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v11
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v12, v28
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
-; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v12, v28
+; GFX6-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v28
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v12
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v13, v29
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
-; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
+; GFX6-NEXT:    v_cndmask_b32_e32 v12, v12, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v13, v29
+; GFX6-NEXT:    v_sub_i32_e64 v13, s[4:5], v13, v29
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v13
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v13
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v14, v30
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
-; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
+; GFX6-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v14, v30
+; GFX6-NEXT:    v_sub_i32_e64 v14, s[4:5], v14, v30
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v14
+; GFX6-NEXT:    v_ashrrev_i32_e32 v17, 31, v14
+; GFX6-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v16
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, v15, v16
+; GFX6-NEXT:    v_sub_i32_e64 v15, s[4:5], v15, v16
+; GFX6-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v15
+; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX6-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v15, v15, v16, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v0, v16
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v16
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v1, v17
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v17
+; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v1
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v2, v18
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v16, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v18
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v2
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v19
+; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v16, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v20
+; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v16, vcc
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v5, v21
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v5, v21
+; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v5
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v6, v22
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v6, v22
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v6
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v7, v23
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v7, v23
+; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v8, v24
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
-; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v8, v24
+; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v8
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v9, v25
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
-; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v9, v25
+; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v9
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v9
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v10, v26
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v10, v26
+; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v10
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v10
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v11, v27
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
-; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v11, v27
+; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v11
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v11
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v12, v28
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
-; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v12, v28
+; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v12
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v13, v29
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
-; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v13, v29
+; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v13
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v13
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v14, v30
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
-; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v14, v30
+; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v14
+; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v14
+; GFX8-NEXT:    v_xor_b32_e32 v17, 0x80000000, v17
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v16
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v15, v16
+; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v15
+; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX8-NEXT:    v_xor_b32_e32 v16, 0x80000000, v16
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v16, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v16i32:
@@ -1059,43 +1059,43 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
-; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
-; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v2
+; GFX6-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
-; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[4:5], v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i64:
@@ -1103,11 +1103,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
-; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1117,11 +1117,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
+; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[4:5]
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
-; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 4efc224ab1ca7..f8a9c6ae0fbf4 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -27,9 +27,9 @@ entry:
 define i1 @ssub(i32 %a, i32 %b, ptr %c) nounwind {
 ; RV32I-LABEL: ssub:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    sgtz a3, a1
+; RV32I-NEXT:    slt a3, a0, a1
 ; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    slti a0, a1, 0
 ; RV32I-NEXT:    xor a0, a3, a0
 ; RV32I-NEXT:    sw a1, 0(a2)
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll
index ba4d170c719fc..5bffbbf13d09f 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll
@@ -13,11 +13,10 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
 define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32-LABEL: func:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    sgtz a3, a1
+; RV32-NEXT:    slt a2, a0, a1
 ; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    slt a1, a0, a2
-; RV32-NEXT:    beq a3, a1, .LBB0_2
+; RV32-NEXT:    slti a1, a0, 0
+; RV32-NEXT:    beq a2, a1, .LBB0_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    lui a1, 524288
@@ -73,11 +72,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ;
 ; RV64-LABEL: func2:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    sgtz a3, a1
+; RV64-NEXT:    slt a2, a0, a1
 ; RV64-NEXT:    sub a0, a0, a1
-; RV64-NEXT:    slt a1, a0, a2
-; RV64-NEXT:    beq a3, a1, .LBB1_2
+; RV64-NEXT:    slti a1, a0, 0
+; RV64-NEXT:    beq a2, a1, .LBB1_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
index 437c1e2a2e489..78cc2cb1eb4cf 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
@@ -13,12 +13,11 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32-LABEL: func32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    mv a3, a0
-; RV32-NEXT:    mul a0, a1, a2
-; RV32-NEXT:    sgtz a1, a0
-; RV32-NEXT:    sub a0, a3, a0
-; RV32-NEXT:    slt a2, a0, a3
-; RV32-NEXT:    beq a1, a2, .LBB0_2
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slt a2, a0, a1
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    slti a1, a0, 0
+; RV32-NEXT:    beq a2, a1, .LBB0_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    lui a1, 524288
@@ -77,11 +76,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ;
 ; RV64-LABEL: func64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    sgtz a3, a2
+; RV64-NEXT:    slt a1, a0, a2
 ; RV64-NEXT:    sub a0, a0, a2
-; RV64-NEXT:    slt a1, a0, a1
-; RV64-NEXT:    beq a3, a1, .LBB1_2
+; RV64-NEXT:    slti a2, a0, 0
+; RV64-NEXT:    beq a1, a2, .LBB1_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index a30593d7d7afb..699e791fccf48 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -748,9 +748,9 @@ entry:
 define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32-LABEL: ssubo1.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sgtz a3, a1
+; RV32-NEXT:    slt a3, a0, a1
 ; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    slt a0, a1, a0
+; RV32-NEXT:    slti a0, a1, 0
 ; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    sw a1, 0(a2)
 ; RV32-NEXT:    ret
@@ -766,9 +766,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: ssubo1.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sgtz a3, a1
+; RV32ZBA-NEXT:    slt a3, a0, a1
 ; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    slt a0, a1, a0
+; RV32ZBA-NEXT:    slti a0, a1, 0
 ; RV32ZBA-NEXT:    xor a0, a3, a0
 ; RV32ZBA-NEXT:    sw a1, 0(a2)
 ; RV32ZBA-NEXT:    ret
@@ -784,9 +784,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: ssubo1.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sgtz a3, a1
+; RV32ZICOND-NEXT:    slt a3, a0, a1
 ; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    slt a0, a1, a0
+; RV32ZICOND-NEXT:    slti a0, a1, 0
 ; RV32ZICOND-NEXT:    xor a0, a3, a0
 ; RV32ZICOND-NEXT:    sw a1, 0(a2)
 ; RV32ZICOND-NEXT:    ret
@@ -874,9 +874,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64-LABEL: ssubo.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a3, a1
+; RV64-NEXT:    slt a3, a0, a1
 ; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    slti a0, a1, 0
 ; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    sd a1, 0(a2)
 ; RV64-NEXT:    ret
@@ -897,9 +897,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZBA-LABEL: ssubo.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a3, a1
+; RV64ZBA-NEXT:    slt a3, a0, a1
 ; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    slti a0, a1, 0
 ; RV64ZBA-NEXT:    xor a0, a3, a0
 ; RV64ZBA-NEXT:    sd a1, 0(a2)
 ; RV64ZBA-NEXT:    ret
@@ -920,9 +920,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: ssubo.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a3, a1
+; RV64ZICOND-NEXT:    slt a3, a0, a1
 ; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    slti a0, a1, 0
 ; RV64ZICOND-NEXT:    xor a0, a3, a0
 ; RV64ZICOND-NEXT:    sd a1, 0(a2)
 ; RV64ZICOND-NEXT:    ret
@@ -2527,9 +2527,9 @@ entry:
 define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: ssubo.select.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sgtz a2, a1
+; RV32-NEXT:    slt a2, a0, a1
 ; RV32-NEXT:    sub a3, a0, a1
-; RV32-NEXT:    slt a3, a3, a0
+; RV32-NEXT:    slti a3, a3, 0
 ; RV32-NEXT:    bne a2, a3, .LBB36_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a1
@@ -2548,9 +2548,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: ssubo.select.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sgtz a2, a1
+; RV32ZBA-NEXT:    slt a2, a0, a1
 ; RV32ZBA-NEXT:    sub a3, a0, a1
-; RV32ZBA-NEXT:    slt a3, a3, a0
+; RV32ZBA-NEXT:    slti a3, a3, 0
 ; RV32ZBA-NEXT:    bne a2, a3, .LBB36_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a1
@@ -2569,9 +2569,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: ssubo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sgtz a2, a1
+; RV32ZICOND-NEXT:    slt a2, a0, a1
 ; RV32ZICOND-NEXT:    sub a3, a0, a1
-; RV32ZICOND-NEXT:    slt a3, a3, a0
+; RV32ZICOND-NEXT:    slti a3, a3, 0
 ; RV32ZICOND-NEXT:    xor a2, a2, a3
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
@@ -2597,9 +2597,9 @@ entry:
 define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: ssubo.not.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sgtz a2, a1
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    slt a0, a1, a0
+; RV32-NEXT:    slt a2, a0, a1
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    slti a0, a0, 0
 ; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
@@ -2614,9 +2614,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: ssubo.not.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sgtz a2, a1
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    slt a0, a1, a0
+; RV32ZBA-NEXT:    slt a2, a0, a1
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    slti a0, a0, 0
 ; RV32ZBA-NEXT:    xor a0, a2, a0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
@@ -2631,9 +2631,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: ssubo.not.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sgtz a2, a1
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    slt a0, a1, a0
+; RV32ZICOND-NEXT:    slt a2, a0, a1
+; RV32ZICOND-NEXT:    sub a0, a0, a1
+; RV32ZICOND-NEXT:    slti a0, a0, 0
 ; RV32ZICOND-NEXT:    xor a0, a2, a0
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
@@ -2670,9 +2670,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: ssubo.select.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
+; RV64-NEXT:    slt a2, a0, a1
 ; RV64-NEXT:    sub a3, a0, a1
-; RV64-NEXT:    slt a3, a3, a0
+; RV64-NEXT:    slti a3, a3, 0
 ; RV64-NEXT:    bne a2, a3, .LBB38_2
 ; RV64-NEXT:  # %bb.1: # %entry
 ; RV64-NEXT:    mv a0, a1
@@ -2696,9 +2696,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: ssubo.select.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
+; RV64ZBA-NEXT:    slt a2, a0, a1
 ; RV64ZBA-NEXT:    sub a3, a0, a1
-; RV64ZBA-NEXT:    slt a3, a3, a0
+; RV64ZBA-NEXT:    slti a3, a3, 0
 ; RV64ZBA-NEXT:    bne a2, a3, .LBB38_2
 ; RV64ZBA-NEXT:  # %bb.1: # %entry
 ; RV64ZBA-NEXT:    mv a0, a1
@@ -2724,9 +2724,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: ssubo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
+; RV64ZICOND-NEXT:    slt a2, a0, a1
 ; RV64ZICOND-NEXT:    sub a3, a0, a1
-; RV64ZICOND-NEXT:    slt a3, a3, a0
+; RV64ZICOND-NEXT:    slti a3, a3, 0
 ; RV64ZICOND-NEXT:    xor a2, a2, a3
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
@@ -2754,9 +2754,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: ssub.not.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    slt a2, a0, a1
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    slti a0, a0, 0
 ; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
@@ -2775,9 +2775,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: ssub.not.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    slt a2, a0, a1
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    slti a0, a0, 0
 ; RV64ZBA-NEXT:    xor a0, a2, a0
 ; RV64ZBA-NEXT:    xori a0, a0, 1
 ; RV64ZBA-NEXT:    ret
@@ -2796,9 +2796,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: ssub.not.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    slt a2, a0, a1
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    slti a0, a0, 0
 ; RV64ZICOND-NEXT:    xor a0, a2, a0
 ; RV64ZICOND-NEXT:    xori a0, a0, 1
 ; RV64ZICOND-NEXT:    ret
@@ -4196,9 +4196,9 @@ continue:
 define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: ssubo.br.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sgtz a2, a1
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    slt a0, a1, a0
+; RV32-NEXT:    slt a2, a0, a1
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    slti a0, a0, 0
 ; RV32-NEXT:    beq a2, a0, .LBB56_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
@@ -4221,9 +4221,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: ssubo.br.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sgtz a2, a1
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    slt a0, a1, a0
+; RV32ZBA-NEXT:    slt a2, a0, a1
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    slti a0, a0, 0
 ; RV32ZBA-NEXT:    beq a2, a0, .LBB56_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
@@ -4246,9 +4246,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: ssubo.br.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sgtz a2, a1
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    slt a0, a1, a0
+; RV32ZICOND-NEXT:    slt a2, a0, a1
+; RV32ZICOND-NEXT:    sub a0, a0, a1
+; RV32ZICOND-NEXT:    slti a0, a0, 0
 ; RV32ZICOND-NEXT:    beq a2, a0, .LBB56_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
@@ -4300,9 +4300,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: ssubo.br.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    slt a2, a0, a1
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    slti a0, a0, 0
 ; RV64-NEXT:    beq a2, a0, .LBB57_2
 ; RV64-NEXT:  # %bb.1: # %overflow
 ; RV64-NEXT:    li a0, 0
@@ -4329,9 +4329,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: ssubo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    slt a2, a0, a1
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    slti a0, a0, 0
 ; RV64ZBA-NEXT:    beq a2, a0, .LBB57_2
 ; RV64ZBA-NEXT:  # %bb.1: # %overflow
 ; RV64ZBA-NEXT:    li a0, 0
@@ -4358,9 +4358,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: ssubo.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    slt a2, a0, a1
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    slti a0, a0, 0
 ; RV64ZICOND-NEXT:    beq a2, a0, .LBB57_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV64ZICOND-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index c75bb9daefcf2..29beb221797f2 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -48,11 +48,10 @@ define i32 @addusat(i32 %a, i32 %b) {
 define i32 @subsat(i32 %a, i32 %b) {
 ; RV32I-LABEL: subsat:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    sgtz a3, a1
+; RV32I-NEXT:    slt a2, a0, a1
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    slt a1, a0, a2
-; RV32I-NEXT:    beq a3, a1, .LBB2_2
+; RV32I-NEXT:    slti a1, a0, 0
+; RV32I-NEXT:    beq a2, a1, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    lui a1, 524288
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index bbc0ff9bd1be5..d566d0ddce6ba 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -179,47 +179,40 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: ssub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vmov r2, r3, d2
-; CHECK-NEXT:    vmov r1, r0, d0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    mov.w r12, #1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    vmov r4, r5, d1
-; CHECK-NEXT:    subs.w r12, r1, r2
-; CHECK-NEXT:    sbc.w lr, r0, r3
-; CHECK-NEXT:    subs.w r1, r12, r1
-; CHECK-NEXT:    sbcs.w r0, lr, r0
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    cset r0, lt
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    sbcs.w r2, r1, r3
+; CHECK-NEXT:    subs.w lr, r2, r0
+; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    lsr.w r2, r1, #31
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    eorlt r0, r0, #1
-; CHECK-NEXT:    vmov r2, r3, d3
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    subs r6, r4, r2
-; CHECK-NEXT:    sbc.w r7, r5, r3
-; CHECK-NEXT:    subs r4, r6, r4
-; CHECK-NEXT:    sbcs.w r4, r7, r5
-; CHECK-NEXT:    vmov q0[2], q0[0], r12, r6
-; CHECK-NEXT:    cset r4, lt
+; CHECK-NEXT:    eorlt.w r2, r12, r1, lsr #31
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    sbcs.w r2, r1, r3
-; CHECK-NEXT:    bfi r1, r0, #0, #8
+; CHECK-NEXT:    bfi r3, r2, #0, #8
+; CHECK-NEXT:    vmov r2, r0, d3
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    sbcs.w r0, r5, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    asr.w r1, r1, #31
+; CHECK-NEXT:    lsr.w r4, r0, #31
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    eorlt r4, r4, #1
-; CHECK-NEXT:    rsbs r0, r4, #0
-; CHECK-NEXT:    bfi r1, r0, #8, #8
-; CHECK-NEXT:    asrs r0, r7, #31
-; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    asr.w r1, lr, #31
+; CHECK-NEXT:    eorlt.w r4, r12, r0, lsr #31
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], lr, r7
+; CHECK-NEXT:    rsbs r5, r4, #0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    adr r0, .LCPI11_0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    bfi r3, r5, #8, #8
+; CHECK-NEXT:    vmsr p0, r3
 ; CHECK-NEXT:    veor q1, q1, q2
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI11_0:
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index dbfa69d497698..09c7c4b7a26f6 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -1821,67 +1821,60 @@ declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
 ; X86-LABEL: vp_ssub_sat_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; X86-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; X86-NEXT:    vpsrad $31, %xmm1, %xmm2
-; X86-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; X86-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vpsrad $31, %xmm2, %xmm1
+; X86-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_ssub_sat_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psubd %xmm1, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pandn %xmm3, %xmm1
-; SSE-NEXT:    psrad $31, %xmm3
-; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pandn %xmm0, %xmm2
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: vp_ssub_sat_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: vp_ssub_sat_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: vp_ssub_sat_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
   %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll
index 8ecc8b39ac468..ee6b60c075630 100644
--- a/llvm/test/CodeGen/X86/ssub_sat.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat.ll
@@ -207,18 +207,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psubd %xmm1, %xmm3
-; X64-NEXT:    pcmpgtd %xmm2, %xmm1
-; X64-NEXT:    pcmpgtd %xmm3, %xmm0
-; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    movdqa %xmm0, %xmm1
-; X64-NEXT:    pandn %xmm3, %xmm1
-; X64-NEXT:    psrad $31, %xmm3
-; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-NEXT:    pand %xmm3, %xmm0
-; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-NEXT:    psubd %xmm1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-NEXT:    pxor %xmm2, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    pandn %xmm0, %xmm2
+; X64-NEXT:    psrad $31, %xmm0
+; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index eb2ad4fdff92f..bdac954031fb5 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -612,99 +612,91 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v2i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v2i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %z
@@ -713,99 +705,91 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pandn %xmm3, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v4i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v4i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %z
@@ -814,145 +798,136 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm5, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm0
 ; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psubd %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    psubd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pandn %xmm5, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    psubd %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pandn %xmm0, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
 ; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psubd %xmm3, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pandn %xmm2, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    psubd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm1, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm6, %xmm1
 ; SSSE3-NEXT:    pand %xmm2, %xmm1
 ; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    pxor %xmm6, %xmm6
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    pxor %xmm2, %xmm1
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm4
-; SSE41-NEXT:    movdqa %xmm5, %xmm1
-; SSE41-NEXT:    psubd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT:    pxor %xmm3, %xmm5
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm6, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psubd %xmm3, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE41-NEXT:    pxor %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    movaps %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm6, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm3, %ymm0
+; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm6, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v8i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vpsrad $31, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrad $31, %ymm2, %ymm1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v8i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
-; AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512BW-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
+; AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpsrad $31, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    vpsrad $31, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -961,223 +936,215 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm8, %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    psubd %xmm4, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    pandn %xmm9, %xmm10
-; SSE2-NEXT:    psrad $31, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm4, %xmm9
+; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT:    psubd %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm9, %xmm10
+; SSE2-NEXT:    pandn %xmm0, %xmm10
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NEXT:    pand %xmm9, %xmm0
 ; SSE2-NEXT:    por %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NEXT:    psubd %xmm5, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT:    pxor %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pandn %xmm9, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm9
-; SSE2-NEXT:    pxor %xmm4, %xmm9
-; SSE2-NEXT:    pand %xmm9, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    psubd %xmm6, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pandn %xmm5, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm2
-; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psubd %xmm7, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT:    pxor %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pandn %xmm5, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm9
+; SSE2-NEXT:    psubd %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pandn %xmm1, %xmm9
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    psubd %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    psubd %xmm7, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm8, %xmm8
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    psubd %xmm4, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm10
-; SSSE3-NEXT:    pandn %xmm9, %xmm10
-; SSSE3-NEXT:    psrad $31, %xmm9
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    pxor %xmm4, %xmm9
+; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSSE3-NEXT:    psubd %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm9, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm9
+; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    movdqa %xmm9, %xmm10
+; SSSE3-NEXT:    pandn %xmm0, %xmm10
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
 ; SSSE3-NEXT:    pand %xmm9, %xmm0
 ; SSSE3-NEXT:    por %xmm10, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm9
-; SSSE3-NEXT:    psubd %xmm5, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT:    pxor %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSSE3-NEXT:    pandn %xmm9, %xmm5
-; SSSE3-NEXT:    psrad $31, %xmm9
-; SSSE3-NEXT:    pxor %xmm4, %xmm9
-; SSSE3-NEXT:    pand %xmm9, %xmm1
-; SSSE3-NEXT:    por %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    psubd %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pandn %xmm5, %xmm6
-; SSSE3-NEXT:    psrad $31, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm2
-; SSSE3-NEXT:    por %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    psubd %xmm7, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm3
-; SSSE3-NEXT:    pxor %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm6
-; SSSE3-NEXT:    pandn %xmm5, %xmm6
-; SSSE3-NEXT:    psrad $31, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm3
-; SSSE3-NEXT:    por %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm9
+; SSSE3-NEXT:    psubd %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm9
+; SSSE3-NEXT:    pandn %xmm1, %xmm9
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    por %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm6, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    psubd %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm7, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    psubd %xmm7, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    por %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm3, %xmm11
-; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    movdqa %xmm4, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm9
-; SSE41-NEXT:    pxor %xmm12, %xmm12
-; SSE41-NEXT:    movdqa %xmm0, %xmm8
-; SSE41-NEXT:    psubd %xmm4, %xmm8
-; SSE41-NEXT:    pcmpgtd %xmm12, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubd %xmm8, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT:    pxor %xmm4, %xmm8
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm10, %xmm1
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm9, %xmm1
 ; SSE41-NEXT:    psubd %xmm5, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm12, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT:    pxor %xmm5, %xmm9
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm8
+; SSE41-NEXT:    psrad $31, %xmm8
+; SSE41-NEXT:    pxor %xmm10, %xmm8
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psubd %xmm6, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
 ; SSE41-NEXT:    psrad $31, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm10, %xmm2
-; SSE41-NEXT:    psubd %xmm6, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm12, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm10
-; SSE41-NEXT:    pxor %xmm6, %xmm10
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psrad $31, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm10, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
-; SSE41-NEXT:    psubd %xmm7, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm12, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm11
-; SSE41-NEXT:    pxor %xmm7, %xmm11
-; SSE41-NEXT:    movdqa %xmm3, %xmm5
-; SSE41-NEXT:    psrad $31, %xmm5
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    movdqa %xmm11, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm3
-; SSE41-NEXT:    movaps %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    psubd %xmm7, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSE41-NEXT:    pxor %xmm6, %xmm7
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pxor %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    movaps %xmm5, %xmm2
+; SSE41-NEXT:    movaps %xmm6, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v16i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm6, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm8
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vpsrad $31, %xmm7, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
 ; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm6, %ymm0
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm8, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm5, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm5
-; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm8
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT:    vpsrad $31, %xmm7, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm8, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm5
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vblendvps %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm2
-; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpsrad $31, %ymm3, %ymm2
-; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm4, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm4, %ymm0
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm3
+; AVX2-NEXT:    vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vblendvps %ymm1, %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpsrad $31, %zmm1, %zmm0
-; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    vpsrad $31, %zmm0, %zmm1
+; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq
   %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i32> %z
@@ -1189,32 +1156,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -1224,32 +1183,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    psubq %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    psubq %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm0, %xmm1
 ; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
@@ -1260,23 +1211,15 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    por %xmm0, %xmm3
 ; SSE41-NEXT:    psubq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    pxor %xmm0, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm4
-; SSE41-NEXT:    por %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm3
 ; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1287,55 +1230,47 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ;
 ; AVX1-LABEL: v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    # xmm2 = mem[0,0]
-; AVX2-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    # xmm1 = mem[0,0]
+; AVX2-NEXT:    vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT:    # xmm2 = mem[0,0]
-; AVX512F-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT:    # xmm1 = mem[0,0]
+; AVX512F-NEXT:    vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
-; AVX512BW-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpgtq %xmm1, %xmm2, %k2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 {%k2} = [9223372036854775807,9223372036854775807]
-; AVX512BW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
+; AVX512BW-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 {%k1} = [9223372036854775807,9223372036854775807]
+; AVX512BW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k2}
 ; AVX512BW-NEXT:    retq
   %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %z
@@ -1348,62 +1283,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm5, %xmm0
 ; SSE2-NEXT:    psubq %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm6
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm6
 ; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm7
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm7, %xmm0
-; SSE2-NEXT:    pandn %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm6, %xmm7
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    pandn %xmm4, %xmm0
 ; SSE2-NEXT:    por %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    psubq %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    pand %xmm5, %xmm2
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    psubq %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -1413,62 +1332,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
 ; SSSE3-NEXT:    pxor %xmm5, %xmm0
 ; SSSE3-NEXT:    psubq %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm5, %xmm6
-; SSSE3-NEXT:    movdqa %xmm0, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm6
 ; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pxor %xmm6, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm7, %xmm0
-; SSSE3-NEXT:    pandn %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
+; SSSE3-NEXT:    pand %xmm0, %xmm7
+; SSSE3-NEXT:    pandn %xmm4, %xmm0
 ; SSSE3-NEXT:    por %xmm7, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    psubq %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm7
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm5, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm2
-; SSSE3-NEXT:    pand %xmm5, %xmm2
-; SSSE3-NEXT:    pandn %xmm1, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm8, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    psubq %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm1, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
@@ -1478,113 +1381,90 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648]
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    psubq %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pxor %xmm6, %xmm5
-; SSE41-NEXT:    movdqa %xmm0, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm7, %xmm8
-; SSE41-NEXT:    por %xmm0, %xmm8
 ; SSE41-NEXT:    pxor %xmm6, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm7, %xmm5
 ; SSE41-NEXT:    por %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm8, %xmm5
-; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT:    movapd %xmm7, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movapd %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    por %xmm6, %xmm2
 ; SSE41-NEXT:    psubq %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    pxor %xmm6, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm9
-; SSE41-NEXT:    por %xmm0, %xmm9
-; SSE41-NEXT:    pxor %xmm6, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT:    vxorpd %ymm0, %ymm4, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm5
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm1, %ymm4, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm4, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vpsraq $63, %zmm1, %zmm2
+; AVX512F-NEXT:    vpsubq %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsraq $63, %zmm2, %zmm1
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512F-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: v4i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpgtq %ymm2, %ymm1, %k0
-; AVX512BW-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512BW-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0
+; AVX512BW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1
 ; AVX512BW-NEXT:    kxorw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpsraq $63, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    vpsraq $63, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -1598,122 +1478,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm9, %xmm0
 ; SSE2-NEXT:    psubq %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    pxor %xmm9, %xmm10
-; SSE2-NEXT:    movdqa %xmm0, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm10, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm10
 ; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm11
-; SSE2-NEXT:    pxor %xmm10, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm11, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm11
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    pxor %xmm10, %xmm11
+; SSE2-NEXT:    pand %xmm0, %xmm11
+; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm11, %xmm0
 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
 ; SSE2-NEXT:    pxor %xmm9, %xmm1
 ; SSE2-NEXT:    psubq %xmm5, %xmm8
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT:    pxor %xmm9, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm11
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm12, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm11
-; SSE2-NEXT:    pxor %xmm4, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pxor %xmm10, %xmm1
-; SSE2-NEXT:    pand %xmm11, %xmm1
-; SSE2-NEXT:    pandn %xmm8, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm11
+; SSE2-NEXT:    pxor %xmm10, %xmm11
+; SSE2-NEXT:    pand %xmm1, %xmm11
+; SSE2-NEXT:    pandn %xmm8, %xmm1
 ; SSE2-NEXT:    por %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm4
-; SSE2-NEXT:    psubq %xmm6, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm8
+; SSE2-NEXT:    psubq %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm9, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm8
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm8
-; SSE2-NEXT:    pxor %xmm5, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pxor %xmm10, %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    pandn %xmm2, %xmm8
-; SSE2-NEXT:    por %xmm8, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm8
+; SSE2-NEXT:    pxor %xmm10, %xmm8
+; SSE2-NEXT:    pand %xmm5, %xmm8
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm8, %xmm5
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pxor %xmm9, %xmm2
-; SSE2-NEXT:    psubq %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm9
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm9, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pxor %xmm10, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    psubq %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm4
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i64:
@@ -1723,122 +1571,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
 ; SSSE3-NEXT:    pxor %xmm9, %xmm0
 ; SSSE3-NEXT:    psubq %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm10
-; SSSE3-NEXT:    pxor %xmm9, %xmm10
-; SSSE3-NEXT:    movdqa %xmm0, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm12, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm10
 ; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm11, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm11
-; SSSE3-NEXT:    pxor %xmm10, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm4, %xmm10
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm11, %xmm0
-; SSSE3-NEXT:    pandn %xmm1, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm10
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm11
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT:    pxor %xmm10, %xmm11
+; SSSE3-NEXT:    pand %xmm0, %xmm11
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    por %xmm11, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm1
 ; SSSE3-NEXT:    pxor %xmm9, %xmm1
 ; SSSE3-NEXT:    psubq %xmm5, %xmm8
-; SSSE3-NEXT:    movdqa %xmm8, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    movdqa %xmm1, %xmm11
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSSE3-NEXT:    pxor %xmm9, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm11
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm12, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm5, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm11, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm5, %xmm11
-; SSSE3-NEXT:    pxor %xmm4, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pxor %xmm10, %xmm1
-; SSSE3-NEXT:    pand %xmm11, %xmm1
-; SSSE3-NEXT:    pandn %xmm8, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm11
+; SSSE3-NEXT:    pxor %xmm10, %xmm11
+; SSSE3-NEXT:    pand %xmm1, %xmm11
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
 ; SSSE3-NEXT:    por %xmm11, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm9, %xmm4
-; SSSE3-NEXT:    psubq %xmm6, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
 ; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm4, %xmm8
+; SSSE3-NEXT:    psubq %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm9, %xmm6
+; SSSE3-NEXT:    movdqa %xmm6, %xmm8
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm11, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm6
-; SSSE3-NEXT:    movdqa %xmm6, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm8, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm8
-; SSSE3-NEXT:    pxor %xmm5, %xmm8
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    pxor %xmm10, %xmm4
-; SSSE3-NEXT:    pand %xmm8, %xmm4
-; SSSE3-NEXT:    pandn %xmm2, %xmm8
-; SSSE3-NEXT:    por %xmm8, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm11, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    psrad $31, %xmm8
+; SSSE3-NEXT:    pxor %xmm10, %xmm8
+; SSSE3-NEXT:    pand %xmm5, %xmm8
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm8, %xmm5
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    pxor %xmm9, %xmm2
-; SSSE3-NEXT:    psubq %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm9
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm8, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm9, %xmm7
-; SSSE3-NEXT:    movdqa %xmm7, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm7
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm7, %xmm2
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    psrad $31, %xmm5
-; SSSE3-NEXT:    pxor %xmm10, %xmm5
-; SSSE3-NEXT:    pand %xmm2, %xmm5
-; SSSE3-NEXT:    pandn %xmm3, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    movdqa %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm5, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    psubq %xmm7, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm4
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i64:
@@ -1847,22 +1663,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648]
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    psubq %xmm4, %xmm8
-; SSE41-NEXT:    movdqa %xmm8, %xmm9
-; SSE41-NEXT:    pxor %xmm10, %xmm9
-; SSE41-NEXT:    movdqa %xmm0, %xmm11
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm11
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm11, %xmm12
-; SSE41-NEXT:    por %xmm0, %xmm12
 ; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm11
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm9
+; SSE41-NEXT:    pand %xmm11, %xmm9
 ; SSE41-NEXT:    por %xmm4, %xmm9
-; SSE41-NEXT:    pxor %xmm12, %xmm9
+; SSE41-NEXT:    pxor %xmm8, %xmm9
 ; SSE41-NEXT:    movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
 ; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
 ; SSE41-NEXT:    movapd %xmm11, %xmm4
@@ -1873,22 +1681,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm9
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm9
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm9, %xmm13
-; SSE41-NEXT:    por %xmm0, %xmm13
 ; SSE41-NEXT:    pxor %xmm10, %xmm5
-; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm4
+; SSE41-NEXT:    pand %xmm9, %xmm4
 ; SSE41-NEXT:    por %xmm5, %xmm4
-; SSE41-NEXT:    pxor %xmm13, %xmm4
+; SSE41-NEXT:    pxor %xmm1, %xmm4
 ; SSE41-NEXT:    movapd %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
@@ -1897,22 +1697,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    psubq %xmm6, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm9
-; SSE41-NEXT:    por %xmm0, %xmm9
 ; SSE41-NEXT:    pxor %xmm10, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm4
+; SSE41-NEXT:    pand %xmm5, %xmm4
 ; SSE41-NEXT:    por %xmm6, %xmm4
-; SSE41-NEXT:    pxor %xmm9, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm4
 ; SSE41-NEXT:    movapd %xmm11, %xmm5
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm5
@@ -1920,23 +1712,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm10
+; SSE41-NEXT:    movdqa %xmm10, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pand %xmm5, %xmm4
+; SSE41-NEXT:    por %xmm10, %xmm4
 ; SSE41-NEXT:    psubq %xmm7, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm10, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    por %xmm0, %xmm6
-; SSE41-NEXT:    pxor %xmm10, %xmm7
-; SSE41-NEXT:    movdqa %xmm7, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pand %xmm0, %xmm4
-; SSE41-NEXT:    por %xmm7, %xmm4
-; SSE41-NEXT:    pxor %xmm6, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm11
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
@@ -1946,74 +1730,65 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ;
 ; AVX1-LABEL: v8i64:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT:    vxorpd %ymm0, %ymm6, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm6
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorpd %ymm5, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm7
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsubq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm7
-; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX1-NEXT:    vxorpd %ymm1, %ymm6, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm6
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vxorpd %ymm5, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm6, %ymm1
+; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm7
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm5, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
+; AVX1-NEXT:    vxorpd %ymm2, %ymm6, %ymm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vxorpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm6, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm6
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT:    vxorpd %ymm3, %ymm7, %ymm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm4, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v8i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm4, %ymm0
+; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm4, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: v8i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtq %zmm2, %zmm1, %k0
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
-; AVX512-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq
   %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
   ret <8 x i64> %z
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 746c09e5e70db..57811c0eb8233 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -43,20 +43,21 @@ define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
 define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: ssubo_v2i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psubd %xmm1, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm3, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movq %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: ssubo_v2i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vmovq %xmm1, (%rdi)
@@ -64,9 +65,9 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v2i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -84,47 +85,50 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: ssubo_v3i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movq %xmm3, (%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movd %xmm1, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: ssubo_v3i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm0
-; SSSE3-NEXT:    movq %xmm3, (%rdi)
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ssubo_v3i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    pextrd $2, %xmm3, 8(%rdi)
-; SSE41-NEXT:    movq %xmm3, (%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ssubo_v3i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
@@ -133,9 +137,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v3i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -154,20 +158,21 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: ssubo_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psubd %xmm1, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm3, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: ssubo_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
@@ -175,9 +180,9 @@ define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -199,39 +204,40 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    movd %r8d, %xmm0
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    movd %r8d, %xmm1
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    movd %esi, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT:    movd %r9d, %xmm1
 ; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psubd %xmm0, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    psubd %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movq %xmm3, 16(%rcx)
-; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    psubd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movq %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
 ; SSE2-NEXT:    movq %xmm2, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm5, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: ssubo_v6i32:
@@ -240,97 +246,99 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    movd %r8d, %xmm0
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    movd %esi, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    movd %r8d, %xmm1
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    movd %edx, %xmm1
-; SSSE3-NEXT:    movd %esi, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSSE3-NEXT:    movd %r9d, %xmm1
 ; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    psubd %xmm0, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    psubd %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pxor %xmm5, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    psubd %xmm2, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
-; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    psubd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
 ; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm5, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ssubo_v6i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq %rdi, %rax
-; SSE41-NEXT:    movd %esi, %xmm1
-; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
-; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
-; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
-; SSE41-NEXT:    movd %r9d, %xmm0
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movd %esi, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd %r9d, %xmm2
 ; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
 ; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psubd %xmm3, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psubd %xmm2, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movq %xmm1, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm4, (%rcx)
-; SSE41-NEXT:    movq %xmm0, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm3, (%rdi)
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    movq %xmm2, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm0, (%rcx)
+; SSE41-NEXT:    movq %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm5, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v6i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm4, %ymm0
 ; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ssubo_v6i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
@@ -340,9 +348,9 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v6i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -362,44 +370,47 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: ssubo_v8i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm2, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psubd %xmm2, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    psubd %xmm3, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm5, (%rdi)
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm4
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm5, %xmm2
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm0, %ymm4, %ymm0
 ; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ssubo_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
@@ -407,9 +418,9 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -427,57 +438,64 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: ssubo_v16i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm9, %xmm9
-; SSE-NEXT:    movdqa %xmm0, %xmm8
-; SSE-NEXT:    psubd %xmm4, %xmm8
-; SSE-NEXT:    pcmpgtd %xmm9, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm8, %xmm0
-; SSE-NEXT:    pxor %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm4
-; SSE-NEXT:    psubd %xmm5, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm9, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE-NEXT:    pxor %xmm5, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm5
-; SSE-NEXT:    psubd %xmm6, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm9, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm6
-; SSE-NEXT:    psubd %xmm7, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm9, %xmm7
-; SSE-NEXT:    pcmpgtd %xmm6, %xmm3
-; SSE-NEXT:    pxor %xmm7, %xmm3
-; SSE-NEXT:    movdqa %xmm6, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm8, (%rdi)
+; SSE-NEXT:    movdqa %xmm4, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm9
+; SSE-NEXT:    psubd %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm8, %xmm8
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE-NEXT:    pxor %xmm9, %xmm8
+; SSE-NEXT:    movdqa %xmm5, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm9
+; SSE-NEXT:    psubd %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm9, %xmm5
+; SSE-NEXT:    movdqa %xmm6, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSE-NEXT:    psubd %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm9, %xmm6
+; SSE-NEXT:    movdqa %xmm7, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm9
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE-NEXT:    pxor %xmm9, %xmm4
+; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm6, %xmm2
+; SSE-NEXT:    movdqa %xmm4, %xmm3
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm7
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
 ; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm6, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
-; AVX1-NEXT:    vpsubd %xmm6, %xmm8, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm7, %xmm8
+; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
 ; AVX1-NEXT:    vpxor %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm8
 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm5
@@ -497,18 +515,18 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ;
 ; AVX2-LABEL: ssubo_v16i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm3, %ymm4
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpacksswb %xmm4, %xmm4, %xmm1
 ; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
 ; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
@@ -517,9 +535,9 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ;
 ; AVX512-LABEL: ssubo_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
@@ -761,26 +779,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: ssubo_v2i64:
@@ -788,26 +802,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
 ; SSSE3-NEXT:    psubq %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ssubo_v2i64:
@@ -815,33 +825,29 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
 ; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE41-NEXT:    por %xmm2, %xmm3
 ; SSE41-NEXT:    psubq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ssubo_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -850,9 +856,9 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
 ; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
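
For reference, a minimal scalar C++ sketch of the predicate the updated SSE/AVX sequences above compute: compare the operands, perform the wrapping subtraction, compare the result against zero, then XOR the two mask bits. The helper name, the use of 32-bit ints, and the sample values are illustrative only and are not taken from the patch.

    // Illustrative sketch, separate from the patch: scalar form of the
    // overflow test the vector code above performs, i.e. the operand
    // compare XORed with the sign of the wrapped difference.
    #include <cassert>
    #include <cstdint>

    static bool ssub_overflows(int32_t lhs, int32_t rhs) {
      // Subtract in unsigned arithmetic so the wrap is well defined, then
      // reinterpret the low 32 bits as the signed result.
      int32_t res = static_cast<int32_t>(static_cast<uint32_t>(lhs) -
                                         static_cast<uint32_t>(rhs));
      // Overflow happened exactly when the two sign tests disagree.
      return (lhs < rhs) != (res < 0);
    }

    int main() {
      assert(!ssub_overflows(5, 3));          // 2 fits, no overflow
      assert(ssub_overflows(INT32_MIN, 1));   // wraps around to INT32_MAX
      assert(ssub_overflows(INT32_MAX, -1));  // wraps around to INT32_MIN
      return 0;
    }

The same equivalence holds at any signed bit width, which is why the lowering needs only the two compares and one XOR per element.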


