[llvm] [TargetLowering] Change subtraction to do (LHS < RHS) XOR (RESULT < 0) (PR #150872)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 28 09:04:27 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/150872
From 791565225a3415fcca0482ec9897c818594c04a5 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Mon, 28 Jul 2025 00:35:42 -0400
Subject: [PATCH] [TargetLowering] Change subtraction to do (LHS < RHS) XOR
(RESULT < 0)
Compute the overflow bit for subtraction as (LHS < RHS) XOR (RESULT < 0)
instead of (RHS > 0) XOR (RESULT < LHS); this form folds better.
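For reference, a minimal standalone sketch (not taken from the patch) comparing
the previous check, (RHS > 0) XOR (RESULT < LHS), with the new
(LHS < RHS) XOR (RESULT < 0) form; both detect signed subtraction overflow,
assuming two's-complement wrapping:

// Sketch only: both formulations of the ssub overflow check for int32_t.
#include <cassert>
#include <cstdint>

static bool ssubOverflowOld(int32_t LHS, int32_t RHS, int32_t &Res) {
  Res = static_cast<int32_t>(static_cast<uint32_t>(LHS) -
                             static_cast<uint32_t>(RHS)); // wrapping subtract
  // Previous form: overflow iff (RHS > 0) XOR (Res < LHS).
  return (RHS > 0) ^ (Res < LHS);
}

static bool ssubOverflowNew(int32_t LHS, int32_t RHS, int32_t &Res) {
  Res = static_cast<int32_t>(static_cast<uint32_t>(LHS) -
                             static_cast<uint32_t>(RHS)); // wrapping subtract
  // New form: overflow iff (LHS < RHS) XOR (Res < 0).
  return (LHS < RHS) ^ (Res < 0);
}

int main() {
  int32_t R;
  // INT32_MIN - 1 overflows; both forms agree.
  assert(ssubOverflowOld(INT32_MIN, 1, R) && ssubOverflowNew(INT32_MIN, 1, R));
  // 5 - 3 does not overflow; both forms agree.
  assert(!ssubOverflowOld(5, 3, R) && !ssubOverflowNew(5, 3, R));
  return 0;
}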
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 31 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 32 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 29 +
llvm/test/CodeGen/AMDGPU/ssubsat.ll | 1010 +++++-----
.../test/CodeGen/RISCV/arith-with-overflow.ll | 4 +-
llvm/test/CodeGen/RISCV/ssub_sat.ll | 14 +-
llvm/test/CodeGen/RISCV/ssub_sat_plus.ll | 18 +-
llvm/test/CodeGen/RISCV/xaluo.ll | 120 +-
llvm/test/CodeGen/RISCV/xqcia.ll | 7 +-
.../CodeGen/Thumb2/mve-saturating-arith.ll | 55 +-
.../CodeGen/X86/expand-vp-int-intrinsics.ll | 79 +-
llvm/test/CodeGen/X86/ssub_sat.ll | 24 +-
llvm/test/CodeGen/X86/ssub_sat_vec.ll | 1743 +++++++----------
llvm/test/CodeGen/X86/vec_ssubo.ll | 542 ++---
14 files changed, 1755 insertions(+), 1953 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f7d9367..5830016dfd82b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8871,21 +8871,26 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
MIRBuilder.buildSub(NewDst0, LHS, RHS);
// TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
-
auto Zero = MIRBuilder.buildConstant(Ty, 0);
- // For an addition, the result should be less than one of the operands (LHS)
- // if and only if the other operand (RHS) is negative, otherwise there will
- // be overflow.
- // For a subtraction, the result should be less than one of the operands
- // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
- // otherwise there will be overflow.
- auto ResultLowerThanLHS =
- MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
- auto ConditionRHS = MIRBuilder.buildICmp(
- IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
-
- MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
+ if (IsAdd) {
+ // For addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ auto ResultLowerThanLHS =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
+ auto RHSNegative =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, RHS, Zero);
+ MIRBuilder.buildXor(Dst1, RHSNegative, ResultLowerThanLHS);
+ } else {
+ // For subtraction, overflow occurs when the signed comparison of the
+ // operands (LHS < RHS) does not match the sign of the result.
+ auto LHSLessThanRHS =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS, RHS);
+ auto ResultNegative =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, Zero);
+ MIRBuilder.buildXor(Dst1, LHSLessThanRHS, ResultNegative);
+ }
MIRBuilder.buildCopy(Dst0, NewDst0);
MI.eraseFromParent();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910861df4..7d9c8e3865405 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11441,19 +11441,25 @@ void TargetLowering::expandSADDSUBO(
SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
- // For an addition, the result should be less than one of the operands (LHS)
- // if and only if the other operand (RHS) is negative, otherwise there will
- // be overflow.
- // For a subtraction, the result should be less than one of the operands
- // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
- // otherwise there will be overflow.
- SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
- SDValue ConditionRHS =
- DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
-
- Overflow = DAG.getBoolExtOrTrunc(
- DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
- ResultType, ResultType);
+ if (IsAdd) {
+ // For addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ SDValue ResultLowerThanLHS =
+ DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+ SDValue ConditionRHS = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETLT);
+ Overflow = DAG.getBoolExtOrTrunc(
+ DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+ ResultType, ResultType);
+ } else {
+ // For subtraction, overflow occurs when the signed comparison of the
+ // operands (LHS < RHS) does not match the sign of the result.
+ SDValue LHSLessThanRHS = DAG.getSetCC(dl, OType, LHS, RHS, ISD::SETLT);
+ SDValue ResultNegative = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETLT);
+ Overflow = DAG.getBoolExtOrTrunc(
+ DAG.getNode(ISD::XOR, dl, OType, LHSLessThanRHS, ResultNegative), dl,
+ ResultType, ResultType);
+ }
}
bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3d859f8..88a92094f4772 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14497,6 +14497,35 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(customLegalizeToWOp(N, DAG, ExtOpc));
break;
}
+ case ISD::SSUBO: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+
+ // If the RHS is a constant, the LHS < RHS comparison below can be
+ // simplified. Otherwise use the default legalization.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return;
+
+ SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MVT::i64, LHS, RHS);
+ Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Res,
+ DAG.getValueType(MVT::i32));
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ // For subtraction, overflow occurs when the signed comparison of the
+ // operands (LHS < RHS) does not match the sign of the result.
+ EVT OType = N->getValueType(1);
+ SDValue LHSLessThanRHS = DAG.getSetCC(DL, OType, LHS, RHS, ISD::SETLT);
+ SDValue ResultNegative = DAG.getSetCC(DL, OType, Res, Zero, ISD::SETLT);
+ SDValue Overflow =
+ DAG.getNode(ISD::XOR, DL, OType, LHSLessThanRHS, ResultNegative);
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ Results.push_back(Overflow);
+ return;
+ }
case ISD::SADDO: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 40d80f5e83e36..24ad6af504b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -80,13 +80,13 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
; GFX8-LABEL: v_ssubsat_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1
-; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v1
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_i16:
@@ -120,25 +120,25 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_ssubsat_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
-; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v1
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
-; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_i32:
@@ -181,21 +181,21 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4
-; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1
-; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
-; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v2
+; GFX8-NEXT: v_sub_u16_e32 v2, v3, v2
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3
+; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v1
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -244,27 +244,27 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4
-; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
-; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3
-; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT: v_sub_u16_e32 v4, v5, v4
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2
-; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v3
+; GFX8-NEXT: v_sub_u16_e32 v1, v1, v3
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1
+; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v2
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -321,39 +321,39 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4
-; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
-; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
-; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2
-; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
-; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT: v_sub_u16_e32 v4, v5, v4
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5
+; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v2
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5
-; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3
-; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3
-; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
-; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
-; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v2
+; GFX8-NEXT: v_sub_u16_e32 v2, v4, v2
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v2
+; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
+; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v3
+; GFX8-NEXT: v_sub_u16_e32 v1, v1, v3
+; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1
+; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -379,39 +379,39 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_ssubsat_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
-; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v2
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3
+; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
-; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3
+; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v2i32:
@@ -435,53 +435,53 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX6-LABEL: v_ssubsat_v3i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v3
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4
+; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v5
+; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v5
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v3i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
-; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v3
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4
+; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v5
+; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v3i32:
@@ -507,67 +507,67 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX6-LABEL: v_ssubsat_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v5
+; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v6
+; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v7
+; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v4i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
-; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v5
+; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v6
+; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v7
+; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v4i32:
@@ -595,123 +595,123 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX6-LABEL: v_ssubsat_v8i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v8
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v8
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v9
+; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v10
+; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v10
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v2
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v11
+; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v11
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v12
+; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v12
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v5, v13
+; GFX6-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v13
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v5
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v6, v14
+; GFX6-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v14
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v6
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v7, v15
+; GFX6-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v15
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v8i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v8
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v9
+; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v10
+; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v2
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v11
+; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v12
+; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v4
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5
-; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v13
+; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v5
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6
-; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v6, v14
+; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v6
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7
-; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v7, v15
+; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v8i32:
@@ -747,239 +747,239 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-LABEL: v_ssubsat_v16i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v16
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v0
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v17
+; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v17
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v1
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v18
+; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v2
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v19
+; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v19
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v20
+; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v4
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
-; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v5, v21
+; GFX6-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v21
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v5
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6
-; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v6, v22
+; GFX6-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v22
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v6
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7
-; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v7, v23
+; GFX6-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v23
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8
-; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v8, v24
+; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v24
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v8
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v8
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9
-; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v25
+; GFX6-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v25
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10
-; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v10, v26
+; GFX6-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v26
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v10
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11
-; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v11, v27
+; GFX6-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v27
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v11
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12
-; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v12, v28
+; GFX6-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v28
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v12
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v12
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13
-; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13
+; GFX6-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v13, v29
+; GFX6-NEXT: v_sub_i32_e64 v13, s[4:5], v13, v29
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v13
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v13
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc
-; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14
-; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14
+; GFX6-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v14, v30
+; GFX6-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v30
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v14
+; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v14
+; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v15, v16
+; GFX6-NEXT: v_sub_i32_e64 v15, s[4:5], v15, v16
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
+; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v16i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0
-; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v16
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v0
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v17
+; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v1
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v18
+; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v2
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v19
+; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v20
+; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v4
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
-; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v21
+; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v5
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6
-; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v6, v22
+; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v6
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7
-; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v7, v23
+; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8
-; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v8, v24
+; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v8
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v8
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9
-; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v25
+; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10
-; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v10, v26
+; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v10
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11
-; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v11, v27
+; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v11
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12
-; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v12, v28
+; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v12
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v12
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13
-; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v13, v29
+; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v13
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v13
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc
-; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14
-; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v14, v30
+; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v14
+; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v14
+; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
-; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v15, v16
+; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
+; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v16i32:
@@ -1059,43 +1059,43 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_ssubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
-; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v2
+; GFX6-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
-; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
-; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v2
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v3, s[4:5]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ssubsat_i64:
@@ -1103,11 +1103,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
-; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1117,11 +1117,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5]
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
-; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 4efc224ab1ca7..f8a9c6ae0fbf4 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -27,9 +27,9 @@ entry:
define i1 @ssub(i32 %a, i32 %b, ptr %c) nounwind {
; RV32I-LABEL: ssub:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: sgtz a3, a1
+; RV32I-NEXT: slt a3, a0, a1
; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: slt a0, a1, a0
+; RV32I-NEXT: slti a0, a1, 0
; RV32I-NEXT: xor a0, a3, a0
; RV32I-NEXT: sw a1, 0(a2)
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll
index ba4d170c719fc..5bffbbf13d09f 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll
@@ -13,11 +13,10 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
; RV32-LABEL: func:
; RV32: # %bb.0:
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: sgtz a3, a1
+; RV32-NEXT: slt a2, a0, a1
; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: slt a1, a0, a2
-; RV32-NEXT: beq a3, a1, .LBB0_2
+; RV32-NEXT: slti a1, a0, 0
+; RV32-NEXT: beq a2, a1, .LBB0_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: srai a0, a0, 31
; RV32-NEXT: lui a1, 524288
@@ -73,11 +72,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
;
; RV64-LABEL: func2:
; RV64: # %bb.0:
-; RV64-NEXT: mv a2, a0
-; RV64-NEXT: sgtz a3, a1
+; RV64-NEXT: slt a2, a0, a1
; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: slt a1, a0, a2
-; RV64-NEXT: beq a3, a1, .LBB1_2
+; RV64-NEXT: slti a1, a0, 0
+; RV64-NEXT: beq a2, a1, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: srai a0, a0, 63
; RV64-NEXT: li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
index 437c1e2a2e489..78cc2cb1eb4cf 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
@@ -13,12 +13,11 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
; RV32-LABEL: func32:
; RV32: # %bb.0:
-; RV32-NEXT: mv a3, a0
-; RV32-NEXT: mul a0, a1, a2
-; RV32-NEXT: sgtz a1, a0
-; RV32-NEXT: sub a0, a3, a0
-; RV32-NEXT: slt a2, a0, a3
-; RV32-NEXT: beq a1, a2, .LBB0_2
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slt a2, a0, a1
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: slti a1, a0, 0
+; RV32-NEXT: beq a2, a1, .LBB0_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: srai a0, a0, 31
; RV32-NEXT: lui a1, 524288
@@ -77,11 +76,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
;
; RV64-LABEL: func64:
; RV64: # %bb.0:
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: sgtz a3, a2
+; RV64-NEXT: slt a1, a0, a2
; RV64-NEXT: sub a0, a0, a2
-; RV64-NEXT: slt a1, a0, a1
-; RV64-NEXT: beq a3, a1, .LBB1_2
+; RV64-NEXT: slti a2, a0, 0
+; RV64-NEXT: beq a1, a2, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: srai a0, a0, 63
; RV64-NEXT: li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index a30593d7d7afb..699e791fccf48 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -748,9 +748,9 @@ entry:
define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32-LABEL: ssubo1.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sgtz a3, a1
+; RV32-NEXT: slt a3, a0, a1
; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: slt a0, a1, a0
+; RV32-NEXT: slti a0, a1, 0
; RV32-NEXT: xor a0, a3, a0
; RV32-NEXT: sw a1, 0(a2)
; RV32-NEXT: ret
@@ -766,9 +766,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZBA-LABEL: ssubo1.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sgtz a3, a1
+; RV32ZBA-NEXT: slt a3, a0, a1
; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: slt a0, a1, a0
+; RV32ZBA-NEXT: slti a0, a1, 0
; RV32ZBA-NEXT: xor a0, a3, a0
; RV32ZBA-NEXT: sw a1, 0(a2)
; RV32ZBA-NEXT: ret
@@ -784,9 +784,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZICOND-LABEL: ssubo1.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sgtz a3, a1
+; RV32ZICOND-NEXT: slt a3, a0, a1
; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: slt a0, a1, a0
+; RV32ZICOND-NEXT: slti a0, a1, 0
; RV32ZICOND-NEXT: xor a0, a3, a0
; RV32ZICOND-NEXT: sw a1, 0(a2)
; RV32ZICOND-NEXT: ret
@@ -874,9 +874,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64-LABEL: ssubo.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a3, a1
+; RV64-NEXT: slt a3, a0, a1
; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: slt a0, a1, a0
+; RV64-NEXT: slti a0, a1, 0
; RV64-NEXT: xor a0, a3, a0
; RV64-NEXT: sd a1, 0(a2)
; RV64-NEXT: ret
@@ -897,9 +897,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZBA-LABEL: ssubo.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a3, a1
+; RV64ZBA-NEXT: slt a3, a0, a1
; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: slt a0, a1, a0
+; RV64ZBA-NEXT: slti a0, a1, 0
; RV64ZBA-NEXT: xor a0, a3, a0
; RV64ZBA-NEXT: sd a1, 0(a2)
; RV64ZBA-NEXT: ret
@@ -920,9 +920,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZICOND-LABEL: ssubo.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a3, a1
+; RV64ZICOND-NEXT: slt a3, a0, a1
; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: slt a0, a1, a0
+; RV64ZICOND-NEXT: slti a0, a1, 0
; RV64ZICOND-NEXT: xor a0, a3, a0
; RV64ZICOND-NEXT: sd a1, 0(a2)
; RV64ZICOND-NEXT: ret
@@ -2527,9 +2527,9 @@ entry:
define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: ssubo.select.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sgtz a2, a1
+; RV32-NEXT: slt a2, a0, a1
; RV32-NEXT: sub a3, a0, a1
-; RV32-NEXT: slt a3, a3, a0
+; RV32-NEXT: slti a3, a3, 0
; RV32-NEXT: bne a2, a3, .LBB36_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a1
@@ -2548,9 +2548,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: ssubo.select.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sgtz a2, a1
+; RV32ZBA-NEXT: slt a2, a0, a1
; RV32ZBA-NEXT: sub a3, a0, a1
-; RV32ZBA-NEXT: slt a3, a3, a0
+; RV32ZBA-NEXT: slti a3, a3, 0
; RV32ZBA-NEXT: bne a2, a3, .LBB36_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a1
@@ -2569,9 +2569,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: ssubo.select.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sgtz a2, a1
+; RV32ZICOND-NEXT: slt a2, a0, a1
; RV32ZICOND-NEXT: sub a3, a0, a1
-; RV32ZICOND-NEXT: slt a3, a3, a0
+; RV32ZICOND-NEXT: slti a3, a3, 0
; RV32ZICOND-NEXT: xor a2, a2, a3
; RV32ZICOND-NEXT: czero.nez a1, a1, a2
; RV32ZICOND-NEXT: czero.eqz a0, a0, a2
@@ -2597,9 +2597,9 @@ entry:
define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: ssubo.not.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sgtz a2, a1
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: slt a0, a1, a0
+; RV32-NEXT: slt a2, a0, a1
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: slti a0, a0, 0
; RV32-NEXT: xor a0, a2, a0
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
@@ -2614,9 +2614,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: ssubo.not.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sgtz a2, a1
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: slt a0, a1, a0
+; RV32ZBA-NEXT: slt a2, a0, a1
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: slti a0, a0, 0
; RV32ZBA-NEXT: xor a0, a2, a0
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
@@ -2631,9 +2631,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: ssubo.not.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sgtz a2, a1
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: slt a0, a1, a0
+; RV32ZICOND-NEXT: slt a2, a0, a1
+; RV32ZICOND-NEXT: sub a0, a0, a1
+; RV32ZICOND-NEXT: slti a0, a0, 0
; RV32ZICOND-NEXT: xor a0, a2, a0
; RV32ZICOND-NEXT: xori a0, a0, 1
; RV32ZICOND-NEXT: ret
@@ -2670,9 +2670,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: ssubo.select.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
+; RV64-NEXT: slt a2, a0, a1
; RV64-NEXT: sub a3, a0, a1
-; RV64-NEXT: slt a3, a3, a0
+; RV64-NEXT: slti a3, a3, 0
; RV64-NEXT: bne a2, a3, .LBB38_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
@@ -2696,9 +2696,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: ssubo.select.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
+; RV64ZBA-NEXT: slt a2, a0, a1
; RV64ZBA-NEXT: sub a3, a0, a1
-; RV64ZBA-NEXT: slt a3, a3, a0
+; RV64ZBA-NEXT: slti a3, a3, 0
; RV64ZBA-NEXT: bne a2, a3, .LBB38_2
; RV64ZBA-NEXT: # %bb.1: # %entry
; RV64ZBA-NEXT: mv a0, a1
@@ -2724,9 +2724,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: ssubo.select.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
+; RV64ZICOND-NEXT: slt a2, a0, a1
; RV64ZICOND-NEXT: sub a3, a0, a1
-; RV64ZICOND-NEXT: slt a3, a3, a0
+; RV64ZICOND-NEXT: slti a3, a3, 0
; RV64ZICOND-NEXT: xor a2, a2, a3
; RV64ZICOND-NEXT: czero.nez a1, a1, a2
; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
@@ -2754,9 +2754,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: ssub.not.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: slt a0, a1, a0
+; RV64-NEXT: slt a2, a0, a1
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: slti a0, a0, 0
; RV64-NEXT: xor a0, a2, a0
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: ret
@@ -2775,9 +2775,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: ssub.not.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: slt a0, a1, a0
+; RV64ZBA-NEXT: slt a2, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: slti a0, a0, 0
; RV64ZBA-NEXT: xor a0, a2, a0
; RV64ZBA-NEXT: xori a0, a0, 1
; RV64ZBA-NEXT: ret
@@ -2796,9 +2796,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: ssub.not.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: slt a0, a1, a0
+; RV64ZICOND-NEXT: slt a2, a0, a1
+; RV64ZICOND-NEXT: sub a0, a0, a1
+; RV64ZICOND-NEXT: slti a0, a0, 0
; RV64ZICOND-NEXT: xor a0, a2, a0
; RV64ZICOND-NEXT: xori a0, a0, 1
; RV64ZICOND-NEXT: ret
@@ -4196,9 +4196,9 @@ continue:
define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: ssubo.br.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sgtz a2, a1
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: slt a0, a1, a0
+; RV32-NEXT: slt a2, a0, a1
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: slti a0, a0, 0
; RV32-NEXT: beq a2, a0, .LBB56_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
@@ -4221,9 +4221,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: ssubo.br.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sgtz a2, a1
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: slt a0, a1, a0
+; RV32ZBA-NEXT: slt a2, a0, a1
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: slti a0, a0, 0
; RV32ZBA-NEXT: beq a2, a0, .LBB56_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
@@ -4246,9 +4246,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: ssubo.br.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sgtz a2, a1
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: slt a0, a1, a0
+; RV32ZICOND-NEXT: slt a2, a0, a1
+; RV32ZICOND-NEXT: sub a0, a0, a1
+; RV32ZICOND-NEXT: slti a0, a0, 0
; RV32ZICOND-NEXT: beq a2, a0, .LBB56_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
; RV32ZICOND-NEXT: li a0, 0
@@ -4300,9 +4300,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: ssubo.br.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: slt a0, a1, a0
+; RV64-NEXT: slt a2, a0, a1
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: slti a0, a0, 0
; RV64-NEXT: beq a2, a0, .LBB57_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: li a0, 0
@@ -4329,9 +4329,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: ssubo.br.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: slt a0, a1, a0
+; RV64ZBA-NEXT: slt a2, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: slti a0, a0, 0
; RV64ZBA-NEXT: beq a2, a0, .LBB57_2
; RV64ZBA-NEXT: # %bb.1: # %overflow
; RV64ZBA-NEXT: li a0, 0
@@ -4358,9 +4358,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: ssubo.br.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: slt a0, a1, a0
+; RV64ZICOND-NEXT: slt a2, a0, a1
+; RV64ZICOND-NEXT: sub a0, a0, a1
+; RV64ZICOND-NEXT: slti a0, a0, 0
; RV64ZICOND-NEXT: beq a2, a0, .LBB57_2
; RV64ZICOND-NEXT: # %bb.1: # %overflow
; RV64ZICOND-NEXT: li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index c75bb9daefcf2..29beb221797f2 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -48,11 +48,10 @@ define i32 @addusat(i32 %a, i32 %b) {
define i32 @subsat(i32 %a, i32 %b) {
; RV32I-LABEL: subsat:
; RV32I: # %bb.0:
-; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: sgtz a3, a1
+; RV32I-NEXT: slt a2, a0, a1
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: slt a1, a0, a2
-; RV32I-NEXT: beq a3, a1, .LBB2_2
+; RV32I-NEXT: slti a1, a0, 0
+; RV32I-NEXT: beq a2, a1, .LBB2_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: lui a1, 524288
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index bbc0ff9bd1be5..d566d0ddce6ba 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -179,47 +179,40 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
; CHECK-LABEL: ssub_int64_t:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vmov r2, r3, d2
-; CHECK-NEXT: vmov r1, r0, d0
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: mov.w r12, #1
+; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: subs.w r12, r1, r2
-; CHECK-NEXT: sbc.w lr, r0, r3
-; CHECK-NEXT: subs.w r1, r12, r1
-; CHECK-NEXT: sbcs.w r0, lr, r0
-; CHECK-NEXT: mov.w r1, #0
-; CHECK-NEXT: cset r0, lt
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: sbcs.w r2, r1, r3
+; CHECK-NEXT: subs.w lr, r2, r0
+; CHECK-NEXT: sbcs.w r1, r3, r1
+; CHECK-NEXT: mov.w r3, #0
+; CHECK-NEXT: lsr.w r2, r1, #31
; CHECK-NEXT: it lt
-; CHECK-NEXT: eorlt r0, r0, #1
-; CHECK-NEXT: vmov r2, r3, d3
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: subs r6, r4, r2
-; CHECK-NEXT: sbc.w r7, r5, r3
-; CHECK-NEXT: subs r4, r6, r4
-; CHECK-NEXT: sbcs.w r4, r7, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r12, r6
-; CHECK-NEXT: cset r4, lt
+; CHECK-NEXT: eorlt.w r2, r12, r1, lsr #31
; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: sbcs.w r2, r1, r3
-; CHECK-NEXT: bfi r1, r0, #0, #8
+; CHECK-NEXT: bfi r3, r2, #0, #8
+; CHECK-NEXT: vmov r2, r0, d3
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: sbcs.w r0, r5, r0
+; CHECK-NEXT: vmov q0[2], q0[0], lr, r2
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: asr.w r1, r1, #31
+; CHECK-NEXT: lsr.w r4, r0, #31
; CHECK-NEXT: it lt
-; CHECK-NEXT: eorlt r4, r4, #1
-; CHECK-NEXT: rsbs r0, r4, #0
-; CHECK-NEXT: bfi r1, r0, #8, #8
-; CHECK-NEXT: asrs r0, r7, #31
-; CHECK-NEXT: vmsr p0, r1
-; CHECK-NEXT: asr.w r1, lr, #31
+; CHECK-NEXT: eorlt.w r4, r12, r0, lsr #31
+; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov q0[3], q0[1], lr, r7
+; CHECK-NEXT: rsbs r5, r4, #0
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: adr r0, .LCPI11_0
; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: bfi r3, r5, #8, #8
+; CHECK-NEXT: vmsr p0, r3
; CHECK-NEXT: veor q1, q1, q2
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI11_0:
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index dbfa69d497698..09c7c4b7a26f6 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -1821,67 +1821,60 @@ declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vp_ssub_sat_v4i32:
; X86: # %bb.0:
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; X86-NEXT: vpsrad $31, %xmm1, %xmm2
-; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
-; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpsrad $31, %xmm2, %xmm1
+; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
;
; SSE-LABEL: vp_ssub_sat_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psubd %xmm1, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_ssub_sat_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_ssub_sat_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_ssub_sat_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
; AVX512-NEXT: retq
%v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
ret <4 x i32> %v
diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll
index 8ecc8b39ac468..ee6b60c075630 100644
--- a/llvm/test/CodeGen/X86/ssub_sat.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat.ll
@@ -207,18 +207,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X64-LABEL: vec:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: psubd %xmm1, %xmm3
-; X64-NEXT: pcmpgtd %xmm2, %xmm1
-; X64-NEXT: pcmpgtd %xmm3, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm1
-; X64-NEXT: psrad $31, %xmm3
-; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-NEXT: pand %xmm3, %xmm0
-; X64-NEXT: por %xmm1, %xmm0
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-NEXT: psubd %xmm1, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: pxor %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pandn %xmm0, %xmm2
+; X64-NEXT: psrad $31, %xmm0
+; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index eb2ad4fdff92f..bdac954031fb5 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -612,99 +612,91 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm3, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v2i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k1
-; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
; AVX512BW-NEXT: retq
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
@@ -713,99 +705,91 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm3, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
-; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k1
-; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1}
; AVX512BW-NEXT: retq
%z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %z
@@ -814,145 +798,136 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm5, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pandn %xmm0, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: psubd %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm6, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubd %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm5, %xmm1
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrad $31, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm6, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psubd %xmm3, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm4, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm6, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrad $31, %ymm2, %ymm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
-; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k1
-; AVX512BW-NEXT: vpsrad $31, %ymm1, %ymm0
-; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpsrad $31, %ymm0, %ymm1
+; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT: retq
%z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
@@ -961,223 +936,215 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm9, %xmm10
-; SSE2-NEXT: psrad $31, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT: psubd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pandn %xmm0, %xmm10
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: psubd %xmm5, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm9, %xmm5
-; SSE2-NEXT: psrad $31, %xmm9
-; SSE2-NEXT: pxor %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: psubd %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: psubd %xmm7, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pandn %xmm1, %xmm9
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm9, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm7, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm8, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: psubd %xmm4, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm10
-; SSSE3-NEXT: pandn %xmm9, %xmm10
-; SSSE3-NEXT: psrad $31, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm4, %xmm9
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm8
+; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm9
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm9, %xmm10
+; SSSE3-NEXT: pandn %xmm0, %xmm10
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm9
-; SSSE3-NEXT: psubd %xmm5, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm9, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm9
-; SSSE3-NEXT: pxor %xmm4, %xmm9
-; SSSE3-NEXT: pand %xmm9, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: psubd %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm5, %xmm6
-; SSSE3-NEXT: psrad $31, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
-; SSSE3-NEXT: pxor %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pandn %xmm5, %xmm6
-; SSSE3-NEXT: psrad $31, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9
+; SSSE3-NEXT: psubd %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pandn %xmm1, %xmm9
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: psubd %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: psubd %xmm7, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm10
+; SSE41-NEXT: movdqa %xmm4, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pxor %xmm12, %xmm12
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: psubd %xmm4, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubd %xmm8, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT: pxor %xmm4, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm10, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm1
; SSE41-NEXT: psubd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT: pxor %xmm5, %xmm9
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT: pxor %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: psrad $31, %xmm8
+; SSE41-NEXT: pxor %xmm10, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psubd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: psubd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm10
-; SSE41-NEXT: pxor %xmm6, %xmm10
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm3
-; SSE41-NEXT: psubd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
-; SSE41-NEXT: pxor %xmm7, %xmm11
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm11, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movaps %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm7, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pxor %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm6
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: movaps %xmm5, %xmm2
+; SSE41-NEXT: movaps %xmm6, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm8
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vpsrad $31, %xmm7, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm6, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm8, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm6
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm8
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT: vpsrad $31, %xmm7, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm2, %ymm5
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vblendvps %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm3, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm4, %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm3
+; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0
-; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: vpsrad $31, %zmm0, %zmm1
+; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
@@ -1189,32 +1156,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1224,32 +1183,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm0, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
@@ -1260,23 +1211,15 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pxor %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: por %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1287,55 +1230,47 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: # xmm2 = mem[0,0]
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: # xmm2 = mem[0,0]
-; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm1 = mem[0,0]
+; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtq %xmm2, %xmm1, %k0
-; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: kxorw %k1, %k0, %k1
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k2} = [9223372036854775807,9223372036854775807]
-; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k0
+; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
+; AVX512BW-NEXT: kxorw %k1, %k0, %k2
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [9223372036854775807,9223372036854775807]
+; AVX512BW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k2}
; AVX512BW-NEXT: retq
%z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
ret <2 x i64> %z
@@ -1348,62 +1283,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSE2-NEXT: pxor %xmm5, %xmm0
; SSE2-NEXT: psubq %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: psubq %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psubq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -1413,62 +1332,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm5, %xmm0
; SSSE3-NEXT: psubq %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: movdqa %xmm0, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pandn %xmm4, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm0, %xmm7
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: psubq %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm1, %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: psubq %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
@@ -1478,113 +1381,90 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: psubq %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pxor %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: por %xmm0, %xmm8
; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm5
+; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: por %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm7, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: por %xmm6, %xmm2
; SSE41-NEXT: psubq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: por %xmm0, %xmm9
-; SSE41-NEXT: pxor %xmm6, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vxorpd %ymm0, %ymm4, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm5
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %ymm1, %ymm4, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm4, %ymm0
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsraq $63, %zmm1, %zmm2
+; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsraq $63, %zmm2, %zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtq %ymm2, %ymm1, %k0
-; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k0
+; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k1
-; AVX512BW-NEXT: vpsraq $63, %ymm1, %ymm0
-; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1}
-; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
+; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT: retq
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@@ -1598,122 +1478,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
; SSE2-NEXT: pxor %xmm9, %xmm0
; SSE2-NEXT: psubq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm9, %xmm10
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm10
; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm0
+; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: psrad $31, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm0, %xmm11
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm11, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm9, %xmm1
; SSE2-NEXT: psubq %xmm5, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm12, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm11
-; SSE2-NEXT: pxor %xmm4, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pand %xmm11, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm11
+; SSE2-NEXT: pxor %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm1, %xmm11
+; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: psubq %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: psubq %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm8
; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm8
-; SSE2-NEXT: pxor %xmm5, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm8, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pxor %xmm10, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm8
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm8, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: psubq %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: psubq %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm10, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
@@ -1723,122 +1571,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm9, %xmm0
; SSSE3-NEXT: psubq %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm10
-; SSSE3-NEXT: pxor %xmm9, %xmm10
-; SSSE3-NEXT: movdqa %xmm0, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm10
; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm11
-; SSSE3-NEXT: pxor %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm4, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm0
+; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm11
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm0, %xmm11
+; SSSE3-NEXT: pandn %xmm1, %xmm0
; SSSE3-NEXT: por %xmm11, %xmm0
; SSSE3-NEXT: movdqa %xmm8, %xmm1
; SSSE3-NEXT: pxor %xmm9, %xmm1
; SSSE3-NEXT: psubq %xmm5, %xmm8
-; SSSE3-NEXT: movdqa %xmm8, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm12, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm11
-; SSSE3-NEXT: pxor %xmm4, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pand %xmm11, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm11
+; SSSE3-NEXT: pxor %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm1, %xmm11
+; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: por %xmm11, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: psubq %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: psubq %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm8
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm8
-; SSSE3-NEXT: pxor %xmm5, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm8
-; SSSE3-NEXT: por %xmm8, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm11, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm8
+; SSSE3-NEXT: pxor %xmm10, %xmm8
+; SSSE3-NEXT: pand %xmm5, %xmm8
+; SSSE3-NEXT: pandn %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: psubq %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
; SSSE3-NEXT: pand %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm7
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: psrad $31, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: psubq %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm10, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
@@ -1847,22 +1663,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: psubq %xmm4, %xmm8
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pxor %xmm10, %xmm9
-; SSE41-NEXT: movdqa %xmm0, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm12
-; SSE41-NEXT: por %xmm0, %xmm12
; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm9
+; SSE41-NEXT: pand %xmm11, %xmm9
; SSE41-NEXT: por %xmm4, %xmm9
-; SSE41-NEXT: pxor %xmm12, %xmm9
+; SSE41-NEXT: pxor %xmm8, %xmm9
; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movapd %xmm11, %xmm4
@@ -1873,22 +1681,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm13
-; SSE41-NEXT: por %xmm0, %xmm13
; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
+; SSE41-NEXT: pand %xmm9, %xmm4
; SSE41-NEXT: por %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm13, %xmm4
+; SSE41-NEXT: pxor %xmm1, %xmm4
; SSE41-NEXT: movapd %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
@@ -1897,22 +1697,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: psubq %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: por %xmm0, %xmm9
; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
; SSE41-NEXT: movapd %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
@@ -1920,23 +1712,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm10
+; SSE41-NEXT: movdqa %xmm10, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: por %xmm10, %xmm4
; SSE41-NEXT: psubq %xmm7, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: por %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm10, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: por %xmm7, %xmm4
-; SSE41-NEXT: pxor %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11
; SSE41-NEXT: movdqa %xmm4, %xmm0
@@ -1946,74 +1730,65 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsubq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm6
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX1-NEXT: vxorpd %ymm1, %ymm6, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm6, %ymm1
+; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm7
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
+; AVX1-NEXT: vxorpd %ymm2, %ymm6, %ymm6
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vxorpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm8
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm6
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vxorpd %ymm3, %ymm7, %ymm5
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %ymm4, %ymm3, %ymm2
+; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm4, %ymm0
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm4, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtq %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vpsraq $63, %zmm1, %zmm0
-; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 746c09e5e70db..57811c0eb8233 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -43,20 +43,21 @@ define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psubd %xmm1, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movq %xmm3, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmovq %xmm1, (%rdi)
@@ -64,9 +65,9 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -84,47 +85,50 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movq %xmm3, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: movq %xmm3, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSSE3-NEXT: movd %xmm1, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSSE3-NEXT: movd %xmm0, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psubd %xmm1, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi)
-; SSE41-NEXT: movq %xmm3, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ssubo_v3i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi)
@@ -133,9 +137,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v3i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -154,20 +158,21 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psubd %xmm1, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm3, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
@@ -175,9 +180,9 @@ define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -199,39 +204,40 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: movd %r8d, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd %edx, %xmm3
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movd %r8d, %xmm1
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movd %esi, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: movd %r9d, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movq %xmm3, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movq %xmm1, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: movq %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm5, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v6i32:
@@ -240,97 +246,99 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT: movd %r8d, %xmm0
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd %edx, %xmm3
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: movd %r8d, %xmm1
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movd %esi, %xmm3
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: movd %r9d, %xmm1
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: psubd %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: psubd %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movq %xmm3, 16(%rcx)
-; SSSE3-NEXT: movdqa %xmm4, (%rcx)
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: movq %xmm1, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm0, (%rcx)
; SSSE3-NEXT: movq %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm5, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: movd %esi, %xmm1
-; SSE41-NEXT: pinsrd $1, %edx, %xmm1
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
-; SSE41-NEXT: pinsrd $3, %r8d, %xmm1
-; SSE41-NEXT: movd %r9d, %xmm0
-; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd %r9d, %xmm2
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psubd %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: psubd %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movq %xmm1, 16(%rcx)
-; SSE41-NEXT: movdqa %xmm4, (%rcx)
-; SSE41-NEXT: movq %xmm0, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm3, (%rdi)
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: movq %xmm2, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm0, (%rcx)
+; SSE41-NEXT: movq %xmm3, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm5, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm5
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v6i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -340,9 +348,9 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v6i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -362,44 +370,47 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psubd %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psubd %xmm3, %xmm2
-; SSE-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm2, 16(%rdi)
-; SSE-NEXT: movdqa %xmm5, (%rdi)
+; SSE-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pxor %xmm5, %xmm2
+; SSE-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm5
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
@@ -407,9 +418,9 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -427,57 +438,64 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: psubd %xmm4, %xmm8
-; SSE-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psubd %xmm5, %xmm4
-; SSE-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: psubd %xmm6, %xmm5
-; SSE-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: psubd %xmm7, %xmm6
-; SSE-NEXT: pcmpgtd %xmm9, %xmm7
-; SSE-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm6, 48(%rdi)
-; SSE-NEXT: movdqa %xmm5, 32(%rdi)
-; SSE-NEXT: movdqa %xmm4, 16(%rdi)
-; SSE-NEXT: movdqa %xmm8, (%rdi)
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE-NEXT: psubd %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE-NEXT: pxor %xmm9, %xmm8
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pcmpgtd %xmm1, %xmm9
+; SSE-NEXT: psubd %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE-NEXT: pxor %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE-NEXT: psubd %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE-NEXT: pxor %xmm9, %xmm6
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE-NEXT: psubd %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE-NEXT: pxor %xmm9, %xmm4
+; SSE-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm6, %xmm2
+; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm7
; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm7
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
-; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm7, %xmm8
+; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7
; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm8
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
@@ -497,18 +515,18 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm5
+; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
@@ -517,9 +535,9 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
;
; AVX512-LABEL: ssubo_v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
@@ -761,26 +779,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: movdqa %xmm0, (%rdi)
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v2i64:
@@ -788,26 +802,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v2i64:
@@ -815,33 +825,29 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm2, %xmm3
; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: ssubo_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -850,9 +856,9 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpgtq %xmm0, %xmm1, %k0
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
More information about the llvm-commits mailing list