[llvm] r373187 - [TargetLowering] Simplify expansion of S{ADD,SUB}O
Roger Ferrer Ibanez via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 00:58:51 PDT 2019
Author: rogfer01
Date: Mon Sep 30 00:58:50 2019
New Revision: 373187
URL: http://llvm.org/viewvc/llvm-project?rev=373187&view=rev
Log:
[TargetLowering] Simplify expansion of S{ADD,SUB}O
ISD::SADDO uses the sequence suggested in §2.4 of the RISC-V Spec v2.2.
ISD::SSUBO uses the dual approach, but checks whether the other operand
is (non-zero) positive.
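As a rough scalar illustration of the new conditions (a sketch only, not
the SelectionDAG code; the i32 width and helper names are made up for the
example):

  #include <cstdint>

  // Signed addition: the wrapped sum is below LHS exactly when RHS is
  // non-negative and the add overflowed, or RHS is negative and it did
  // not, so overflow is the XOR of the two checks.
  bool saddo(int32_t LHS, int32_t RHS, int32_t &Result) {
    Result = (int32_t)((uint32_t)LHS + (uint32_t)RHS); // two's-complement wrap
    return (RHS < 0) ^ (Result < LHS);
  }

  // Signed subtraction: dual check, testing RHS for (non-zero) positive.
  bool ssubo(int32_t LHS, int32_t RHS, int32_t &Result) {
    Result = (int32_t)((uint32_t)LHS - (uint32_t)RHS); // two's-complement wrap
    return (RHS > 0) ^ (Result < LHS);
  }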
Differential Revision: https://reviews.llvm.org/D47927
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll
llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll
llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll
llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll
llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll
llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll
llvm/trunk/test/CodeGen/X86/combine-mulo.ll
llvm/trunk/test/CodeGen/X86/mulo-pow2.ll
llvm/trunk/test/CodeGen/X86/sadd_sat.ll
llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
llvm/trunk/test/CodeGen/X86/ssub_sat.ll
llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
llvm/trunk/test/CodeGen/X86/vec_saddo.ll
llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Mon Sep 30 00:58:50 2019
@@ -6907,24 +6907,19 @@ void TargetLowering::expandSADDSUBO(
SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
- // LHSSign -> LHS >= 0
- // RHSSign -> RHS >= 0
- // SumSign -> Result >= 0
- //
- // Add:
- // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
- // Sub:
- // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
- SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
- SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
- SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
- IsAdd ? ISD::SETEQ : ISD::SETNE);
+ // For an addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ // For a subtraction, the result should be less than one of the operands
+ // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
+ // otherwise there will be overflow.
+ SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+ SDValue ConditionRHS =
+ DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
- SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE);
- SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
-
- SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
- Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType);
+ Overflow = DAG.getBoolExtOrTrunc(
+ DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+ ResultType, ResultType);
}
bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
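For reference, the new XOR form computes the same predicate as the old
sign-comparison form; a standalone brute-force check over i8 (a sketch,
not part of the patch) is one way to convince yourself:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int LHS = -128; LHS <= 127; ++LHS) {
      for (int RHS = -128; RHS <= 127; ++RHS) {
        // Wrapping i8 addition.
        int8_t Sum = (int8_t)(uint8_t)(LHS + RHS);
        // Old expansion: operand signs match and differ from the sum's sign.
        bool OldOvf = ((LHS >= 0) == (RHS >= 0)) && ((LHS >= 0) != (Sum >= 0));
        // New expansion: (RHS < 0) XOR (Sum < LHS).
        bool NewOvf = (RHS < 0) ^ (Sum < LHS);
        assert(OldOvf == NewOvf);
      }
    }
    return 0;
  }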
Modified: llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/sadd_sat.ll Mon Sep 30 00:58:50 2019
@@ -54,17 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
; CHECK-LABEL: vec:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
Modified: llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/sadd_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -36,17 +36,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <1
; CHECK-LABEL: v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -57,29 +53,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <3
; CHECK-LABEL: v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.16b, v0.16b, v2.16b
-; CHECK-NEXT: cmlt v16.16b, v4.16b, #0
+; CHECK-NEXT: cmlt v7.16b, v4.16b, #0
; CHECK-NEXT: movi v6.16b, #127
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.16b, v1.16b, v3.16b
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b
; CHECK-NEXT: cmlt v16.16b, v7.16b, #0
; CHECK-NEXT: movi v5.16b, #127
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.16b, v2.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v16.16b, v4.16b, #0
-; CHECK-NEXT: cmge v3.16b, v3.16b, #0
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: cmge v16.16b, v7.16b, #0
-; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmlt v3.16b, v3.16b, #0
+; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -102,42 +90,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <6
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v21.16b, #0
+; CHECK-NEXT: cmlt v4.16b, v4.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b
; CHECK-NEXT: movi v22.16b, #127
; CHECK-NEXT: add v23.16b, v3.16b, v7.16b
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmlt v4.16b, v5.16b, #0
+; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v23.16b, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmlt v4.16b, v6.16b, #0
+; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b
; CHECK-NEXT: movi v17.16b, #127
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmlt v4.16b, v7.16b, #0
+; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.16b, v4.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v24.16b, v16.16b, #0
-; CHECK-NEXT: cmge v5.16b, v5.16b, #0
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v19.16b, #0
-; CHECK-NEXT: cmge v6.16b, v6.16b, #0
-; CHECK-NEXT: cmge v2.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v21.16b, #0
-; CHECK-NEXT: cmge v7.16b, v7.16b, #0
-; CHECK-NEXT: cmge v3.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b
-; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v23.16b, #0
-; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b
-; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -151,17 +123,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v5.8h, v2.8h, #0
; CHECK-NEXT: cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h
; CHECK-NEXT: mvni v3.8h, #128, lsl #8
+; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -172,29 +140,21 @@ define <16 x i16> @v16i16(<16 x i16> %x,
; CHECK-LABEL: v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.8h, v0.8h, v2.8h
-; CHECK-NEXT: cmlt v16.8h, v4.8h, #0
+; CHECK-NEXT: cmlt v7.8h, v4.8h, #0
; CHECK-NEXT: mvni v6.8h, #128, lsl #8
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.8h, v1.8h, v3.8h
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v16.8h, v4.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT: cmge v16.8h, v7.8h, #0
-; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -217,42 +177,26 @@ define <32 x i16> @v32i16(<32 x i16> %x,
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v21.8h, #0
+; CHECK-NEXT: cmlt v4.8h, v4.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvni v22.8h, #128, lsl #8
; CHECK-NEXT: add v23.8h, v3.8h, v7.8h
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmlt v4.8h, v5.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v23.8h, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmlt v4.8h, v6.8h, #0
+; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h
; CHECK-NEXT: mvni v17.8h, #128, lsl #8
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmlt v4.8h, v7.8h, #0
+; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.8h, v4.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v24.8h, v16.8h, #0
-; CHECK-NEXT: cmge v5.8h, v5.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v19.8h, #0
-; CHECK-NEXT: cmge v6.8h, v6.8h, #0
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v21.8h, #0
-; CHECK-NEXT: cmge v7.8h, v7.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h
-; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v23.8h, #0
-; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h
-; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -269,16 +213,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmge v1.8b, v1.8b, #0
-; CHECK-NEXT: cmge v0.8b, v0.8b, #0
-; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT: cmlt v1.8b, v1.8b, #0
+; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@@ -311,17 +251,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8
; CHECK-NEXT: shl v1.4h, v1.4h, #8
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
+; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-NEXT: xtn v0.8b, v0.8h
@@ -348,17 +284,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8
; CHECK-NEXT: shl v2.2s, v2.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: add v3.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmge v2.2s, v2.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
+; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-NEXT: mov w8, v0.s[1]
@@ -380,16 +312,12 @@ define void @v4i16(<4 x i16>* %px, <4 x
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@@ -414,17 +342,13 @@ define void @v2i16(<2 x i16>* %px, <2 x
; CHECK-NEXT: shl v2.2s, v2.2s, #16
; CHECK-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NEXT: add v3.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmge v2.2s, v2.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
+; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: mov w8, v0.s[1]
@@ -443,17 +367,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <1
; CHECK-LABEL: v12i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@@ -468,27 +388,19 @@ define void @v12i16(<12 x i16>* %px, <12
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvni v4.8h, #128, lsl #8
; CHECK-NEXT: add v6.8h, v1.8h, v2.8h
-; CHECK-NEXT: cmlt v16.8h, v6.8h, #0
+; CHECK-NEXT: cmlt v7.8h, v6.8h, #0
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.8h, v0.8h, v3.8h
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmge v16.8h, v6.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT: cmge v16.8h, v7.8h, #0
-; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h
+; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b
; CHECK-NEXT: str q0, [x2]
@@ -508,16 +420,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8
; CHECK-NEXT: ldr b1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: add v3.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmge v1.8b, v1.8b, #0
-; CHECK-NEXT: cmge v0.8b, v0.8b, #0
-; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT: cmlt v1.8b, v1.8b, #0
+; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
@@ -535,16 +443,12 @@ define void @v1i16(<1 x i16>* %px, <1 x
; CHECK-NEXT: ldr h1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
@@ -561,17 +465,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <1
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: add v3.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
; CHECK-NEXT: ret
@@ -585,17 +485,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: add v3.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #7
; CHECK-NEXT: ret
@@ -607,17 +503,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmge v1.2s, v1.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v2.2s, #0
; CHECK-NEXT: cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v3.2s, #128, lsl #24
+; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b
; CHECK-NEXT: ret
%z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -628,17 +520,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -649,29 +537,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmlt v16.4s, v4.4s, #0
+; CHECK-NEXT: cmlt v7.4s, v4.4s, #0
; CHECK-NEXT: mvni v6.4s, #128, lsl #24
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.4s, v1.4s, v3.4s
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmlt v2.4s, v2.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s
; CHECK-NEXT: cmlt v16.4s, v7.4s, #0
; CHECK-NEXT: mvni v5.4s, #128, lsl #24
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.4s, v2.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v16.4s, v4.4s, #0
-; CHECK-NEXT: cmge v3.4s, v3.4s, #0
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s
-; CHECK-NEXT: cmge v16.4s, v7.4s, #0
-; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmlt v3.4s, v3.4s, #0
+; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -694,42 +574,26 @@ define <16 x i32> @v16i32(<16 x i32> %x,
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v21.4s, #0
+; CHECK-NEXT: cmlt v4.4s, v4.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s
; CHECK-NEXT: mvni v22.4s, #128, lsl #24
; CHECK-NEXT: add v23.4s, v3.4s, v7.4s
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmlt v4.4s, v5.4s, #0
+; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v23.4s, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmlt v4.4s, v6.4s, #0
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s
; CHECK-NEXT: mvni v17.4s, #128, lsl #24
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmlt v4.4s, v7.4s, #0
+; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.4s, v4.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v24.4s, v16.4s, #0
-; CHECK-NEXT: cmge v5.4s, v5.4s, #0
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v19.4s, #0
-; CHECK-NEXT: cmge v6.4s, v6.4s, #0
-; CHECK-NEXT: cmge v2.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v21.4s, #0
-; CHECK-NEXT: cmge v7.4s, v7.4s, #0
-; CHECK-NEXT: cmge v3.4s, v3.4s, #0
-; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s
-; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v23.4s, #0
-; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s
-; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -743,18 +607,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v5.2d, v2.2d, #0
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d
+; CHECK-NEXT: cmlt v1.2d, v1.2d, #0
; CHECK-NEXT: dup v4.2d, x8
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d
; CHECK-NEXT: mvn v5.16b, v3.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
; CHECK-NEXT: ret
%z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -766,31 +626,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.2d, v0.2d, v2.2d
; CHECK-NEXT: mov x8, #9223372036854775807
-; CHECK-NEXT: cmlt v6.2d, v4.2d, #0
-; CHECK-NEXT: dup v7.2d, x8
+; CHECK-NEXT: cmlt v5.2d, v4.2d, #0
+; CHECK-NEXT: dup v6.2d, x8
+; CHECK-NEXT: mvn v7.16b, v5.16b
+; CHECK-NEXT: mov v16.16b, v6.16b
+; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b
; CHECK-NEXT: add v5.2d, v1.2d, v3.2d
-; CHECK-NEXT: mvn v16.16b, v6.16b
-; CHECK-NEXT: mov v17.16b, v7.16b
-; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b
-; CHECK-NEXT: cmlt v6.2d, v5.2d, #0
-; CHECK-NEXT: mvn v16.16b, v6.16b
-; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b
-; CHECK-NEXT: cmge v2.2d, v2.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v6.2d, v4.2d, #0
-; CHECK-NEXT: cmge v3.2d, v3.2d, #0
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d
-; CHECK-NEXT: cmge v6.2d, v5.2d, #0
-; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b
-; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b
+; CHECK-NEXT: cmlt v2.2d, v2.2d, #0
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d
+; CHECK-NEXT: cmlt v7.2d, v5.2d, #0
+; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v7.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b
+; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b
+; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b
; CHECK-NEXT: ret
%z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@@ -812,42 +664,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b
; CHECK-NEXT: mvn v20.16b, v22.16b
; CHECK-NEXT: mov v24.16b, v21.16b
+; CHECK-NEXT: cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d
; CHECK-NEXT: add v19.2d, v3.2d, v7.2d
; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b
; CHECK-NEXT: mvn v20.16b, v23.16b
; CHECK-NEXT: mov v22.16b, v21.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmlt v4.2d, v5.2d, #0
+; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d
; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b
; CHECK-NEXT: cmlt v20.2d, v19.2d, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmlt v4.2d, v6.2d, #0
+; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d
; CHECK-NEXT: mvn v23.16b, v20.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmlt v4.2d, v7.2d, #0
+; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d
; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT: cmge v4.2d, v4.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v20.2d, v16.2d, #0
-; CHECK-NEXT: cmge v5.2d, v5.2d, #0
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v17.2d, #0
-; CHECK-NEXT: cmge v6.2d, v6.2d, #0
-; CHECK-NEXT: cmge v2.2d, v2.2d, #0
-; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v18.2d, #0
-; CHECK-NEXT: cmge v7.2d, v7.2d, #0
-; CHECK-NEXT: cmge v3.2d, v3.2d, #0
-; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d
-; CHECK-NEXT: cmeq v2.2d, v2.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v19.2d, #0
-; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d
-; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b
Modified: llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ssub_sat.ll Mon Sep 30 00:58:50 2019
@@ -54,18 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
; CHECK-LABEL: vec:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
+; CHECK-NEXT: cmgt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
Modified: llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ssub_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -37,18 +37,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <1
; CHECK-LABEL: v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
+; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -59,31 +54,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <3
; CHECK-LABEL: v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.16b, v0.16b, v2.16b
-; CHECK-NEXT: cmlt v16.16b, v4.16b, #0
+; CHECK-NEXT: cmlt v7.16b, v4.16b, #0
; CHECK-NEXT: movi v6.16b, #127
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.16b, v1.16b, v3.16b
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmgt v2.16b, v2.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b
; CHECK-NEXT: cmlt v16.16b, v7.16b, #0
; CHECK-NEXT: movi v5.16b, #127
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.16b, v2.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v16.16b, v4.16b, #0
-; CHECK-NEXT: cmge v3.16b, v3.16b, #0
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: cmge v16.16b, v7.16b, #0
-; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmgt v3.16b, v3.16b, #0
+; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -106,46 +91,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <6
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v21.16b, #0
+; CHECK-NEXT: cmgt v4.16b, v4.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b
; CHECK-NEXT: movi v22.16b, #127
; CHECK-NEXT: sub v23.16b, v3.16b, v7.16b
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmgt v4.16b, v5.16b, #0
+; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v23.16b, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmgt v4.16b, v6.16b, #0
+; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b
; CHECK-NEXT: movi v17.16b, #127
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmgt v4.16b, v7.16b, #0
+; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.16b, v4.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v24.16b, v16.16b, #0
-; CHECK-NEXT: cmge v5.16b, v5.16b, #0
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v19.16b, #0
-; CHECK-NEXT: cmge v6.16b, v6.16b, #0
-; CHECK-NEXT: cmge v2.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v21.16b, #0
-; CHECK-NEXT: mvn v4.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: cmge v7.16b, v7.16b, #0
-; CHECK-NEXT: cmge v3.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b
-; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b
-; CHECK-NEXT: cmge v24.16b, v23.16b, #0
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: mvn v4.16b, v5.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b
-; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b
-; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT: mvn v4.16b, v6.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: mvn v4.16b, v7.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -159,18 +124,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v5.8h, v2.8h, #0
; CHECK-NEXT: cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h
; CHECK-NEXT: mvni v3.8h, #128, lsl #8
+; CHECK-NEXT: cmgt v1.8h, v1.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -181,31 +141,21 @@ define <16 x i16> @v16i16(<16 x i16> %x,
; CHECK-LABEL: v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.8h, v0.8h, v2.8h
-; CHECK-NEXT: cmlt v16.8h, v4.8h, #0
+; CHECK-NEXT: cmlt v7.8h, v4.8h, #0
; CHECK-NEXT: mvni v6.8h, #128, lsl #8
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.8h, v1.8h, v3.8h
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmgt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v16.8h, v4.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT: cmge v16.8h, v7.8h, #0
-; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmgt v3.8h, v3.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -228,46 +178,26 @@ define <32 x i16> @v32i16(<32 x i16> %x,
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v21.8h, #0
+; CHECK-NEXT: cmgt v4.8h, v4.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvni v22.8h, #128, lsl #8
; CHECK-NEXT: sub v23.8h, v3.8h, v7.8h
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmgt v4.8h, v5.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v23.8h, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmgt v4.8h, v6.8h, #0
+; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h
; CHECK-NEXT: mvni v17.8h, #128, lsl #8
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmgt v4.8h, v7.8h, #0
+; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.8h, v4.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmge v24.8h, v16.8h, #0
-; CHECK-NEXT: cmge v5.8h, v5.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v19.8h, #0
-; CHECK-NEXT: cmge v6.8h, v6.8h, #0
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v21.8h, #0
-; CHECK-NEXT: mvn v4.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: cmge v7.8h, v7.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h
-; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h
-; CHECK-NEXT: cmge v24.8h, v23.8h, #0
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: mvn v4.16b, v5.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h
-; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h
-; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT: mvn v4.16b, v6.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: mvn v4.16b, v7.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -284,17 +214,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmge v1.8b, v1.8b, #0
-; CHECK-NEXT: cmge v0.8b, v0.8b, #0
-; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT: cmgt v1.8b, v1.8b, #0
+; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@@ -327,18 +252,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8
; CHECK-NEXT: shl v1.4h, v1.4h, #8
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
+; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-NEXT: xtn v0.8b, v0.8h
@@ -365,18 +285,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8
; CHECK-NEXT: shl v2.2s, v2.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmge v2.2s, v2.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
+; CHECK-NEXT: cmgt v2.2s, v2.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v2.8b, v2.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-NEXT: mov w8, v0.s[1]
@@ -398,17 +313,12 @@ define void @v4i16(<4 x i16>* %px, <4 x
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@@ -433,18 +343,13 @@ define void @v2i16(<2 x i16>* %px, <2 x
; CHECK-NEXT: shl v2.2s, v2.2s, #16
; CHECK-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmge v2.2s, v2.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
-; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
+; CHECK-NEXT: cmgt v2.2s, v2.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v2.8b, v2.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: mov w8, v0.s[1]
@@ -463,18 +368,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <1
; CHECK-LABEL: v12i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
+; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@@ -489,29 +389,19 @@ define void @v12i16(<12 x i16>* %px, <12
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvni v4.8h, #128, lsl #8
; CHECK-NEXT: sub v6.8h, v1.8h, v2.8h
-; CHECK-NEXT: cmlt v16.8h, v6.8h, #0
+; CHECK-NEXT: cmlt v7.8h, v6.8h, #0
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.8h, v0.8h, v3.8h
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmgt v2.8h, v2.8h, #0
+; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.8h, v2.8h, #0
-; CHECK-NEXT: cmge v1.8h, v1.8h, #0
-; CHECK-NEXT: cmge v16.8h, v6.8h, #0
-; CHECK-NEXT: cmge v3.8h, v3.8h, #0
-; CHECK-NEXT: cmge v0.8h, v0.8h, #0
-; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
-; CHECK-NEXT: cmge v16.8h, v7.8h, #0
-; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: cmgt v3.8h, v3.8h, #0
+; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h
+; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b
; CHECK-NEXT: str q0, [x2]
@@ -531,17 +421,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8
; CHECK-NEXT: ldr b1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmge v1.8b, v1.8b, #0
-; CHECK-NEXT: cmge v0.8b, v0.8b, #0
-; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
-; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
+; CHECK-NEXT: cmgt v1.8b, v1.8b, #0
+; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
@@ -559,17 +444,12 @@ define void @v1i16(<1 x i16>* %px, <1 x
; CHECK-NEXT: ldr h1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmge v1.4h, v1.4h, #0
-; CHECK-NEXT: cmge v0.4h, v0.4h, #0
-; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
+; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
+; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
@@ -586,18 +466,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <1
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
+; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
; CHECK-NEXT: ret
@@ -611,18 +486,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmge v1.16b, v1.16b, #0
-; CHECK-NEXT: cmge v0.16b, v0.16b, #0
-; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
-; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
+; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #7
; CHECK-NEXT: ret
@@ -634,18 +504,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmge v1.2s, v1.2s, #0
-; CHECK-NEXT: cmge v0.2s, v0.2s, #0
-; CHECK-NEXT: cmge v5.2s, v2.2s, #0
; CHECK-NEXT: cmlt v4.2s, v2.2s, #0
-; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v3.2s, #128, lsl #24
+; CHECK-NEXT: cmgt v1.2s, v1.2s, #0
+; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
-; CHECK-NEXT: mvn v1.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b
; CHECK-NEXT: ret
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@@ -656,18 +521,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
+; CHECK-NEXT: cmgt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@@ -678,31 +538,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmlt v16.4s, v4.4s, #0
+; CHECK-NEXT: cmlt v7.4s, v4.4s, #0
; CHECK-NEXT: mvni v6.4s, #128, lsl #24
+; CHECK-NEXT: mvn v16.16b, v7.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.4s, v1.4s, v3.4s
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: cmgt v2.4s, v2.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s
; CHECK-NEXT: cmlt v16.4s, v7.4s, #0
; CHECK-NEXT: mvni v5.4s, #128, lsl #24
-; CHECK-NEXT: mvn v17.16b, v16.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: cmge v2.4s, v2.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v16.4s, v4.4s, #0
-; CHECK-NEXT: cmge v3.4s, v3.4s, #0
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s
-; CHECK-NEXT: cmge v16.4s, v7.4s, #0
-; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmgt v3.4s, v3.4s, #0
+; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@@ -725,46 +575,26 @@ define <16 x i32> @v16i32(<16 x i32> %x,
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v21.4s, #0
+; CHECK-NEXT: cmgt v4.4s, v4.4s, #0
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s
; CHECK-NEXT: mvni v22.4s, #128, lsl #24
; CHECK-NEXT: sub v23.4s, v3.4s, v7.4s
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmgt v4.4s, v5.4s, #0
+; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v23.4s, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmgt v4.4s, v6.4s, #0
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s
; CHECK-NEXT: mvni v17.4s, #128, lsl #24
; CHECK-NEXT: mvn v25.16b, v24.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmgt v4.4s, v7.4s, #0
+; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: cmge v4.4s, v4.4s, #0
-; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: cmge v24.4s, v16.4s, #0
-; CHECK-NEXT: cmge v5.4s, v5.4s, #0
-; CHECK-NEXT: cmge v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v19.4s, #0
-; CHECK-NEXT: cmge v6.4s, v6.4s, #0
-; CHECK-NEXT: cmge v2.4s, v2.4s, #0
-; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v21.4s, #0
-; CHECK-NEXT: mvn v4.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: cmge v7.4s, v7.4s, #0
-; CHECK-NEXT: cmge v3.4s, v3.4s, #0
-; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s
-; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s
-; CHECK-NEXT: cmge v24.4s, v23.4s, #0
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: mvn v4.16b, v5.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s
-; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s
-; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT: mvn v4.16b, v6.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: mvn v4.16b, v7.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@@ -778,19 +608,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v5.2d, v2.2d, #0
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v3.2d, v2.2d, #0
-; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d
+; CHECK-NEXT: cmgt v1.2d, v1.2d, #0
; CHECK-NEXT: dup v4.2d, x8
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d
; CHECK-NEXT: mvn v5.16b, v3.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
; CHECK-NEXT: ret
%z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -802,33 +627,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.2d, v0.2d, v2.2d
; CHECK-NEXT: mov x8, #9223372036854775807
-; CHECK-NEXT: cmlt v6.2d, v4.2d, #0
-; CHECK-NEXT: dup v7.2d, x8
+; CHECK-NEXT: cmlt v5.2d, v4.2d, #0
+; CHECK-NEXT: dup v6.2d, x8
+; CHECK-NEXT: mvn v7.16b, v5.16b
+; CHECK-NEXT: mov v16.16b, v6.16b
+; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b
; CHECK-NEXT: sub v5.2d, v1.2d, v3.2d
-; CHECK-NEXT: mvn v16.16b, v6.16b
-; CHECK-NEXT: mov v17.16b, v7.16b
-; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b
-; CHECK-NEXT: cmlt v6.2d, v5.2d, #0
-; CHECK-NEXT: mvn v16.16b, v6.16b
-; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b
-; CHECK-NEXT: cmge v2.2d, v2.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v6.2d, v4.2d, #0
-; CHECK-NEXT: cmge v3.2d, v3.2d, #0
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d
-; CHECK-NEXT: cmge v6.2d, v5.2d, #0
-; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b
-; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b
+; CHECK-NEXT: cmgt v2.2d, v2.2d, #0
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d
+; CHECK-NEXT: cmlt v7.2d, v5.2d, #0
+; CHECK-NEXT: cmgt v3.2d, v3.2d, #0
+; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d
+; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v7.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b
+; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b
+; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b
; CHECK-NEXT: ret
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@@ -850,46 +665,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b
; CHECK-NEXT: mvn v20.16b, v22.16b
; CHECK-NEXT: mov v24.16b, v21.16b
+; CHECK-NEXT: cmgt v4.2d, v4.2d, #0
+; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d
; CHECK-NEXT: sub v19.2d, v3.2d, v7.2d
; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b
; CHECK-NEXT: mvn v20.16b, v23.16b
; CHECK-NEXT: mov v22.16b, v21.16b
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: cmgt v4.2d, v5.2d, #0
+; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d
; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b
; CHECK-NEXT: cmlt v20.2d, v19.2d, #0
+; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT: cmgt v4.2d, v6.2d, #0
+; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d
; CHECK-NEXT: mvn v23.16b, v20.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: cmgt v4.2d, v7.2d, #0
+; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d
; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b
-; CHECK-NEXT: cmge v4.2d, v4.2d, #0
-; CHECK-NEXT: cmge v0.2d, v0.2d, #0
-; CHECK-NEXT: cmge v20.2d, v16.2d, #0
-; CHECK-NEXT: cmge v5.2d, v5.2d, #0
-; CHECK-NEXT: cmge v1.2d, v1.2d, #0
-; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v17.2d, #0
-; CHECK-NEXT: cmge v6.2d, v6.2d, #0
-; CHECK-NEXT: cmge v2.2d, v2.2d, #0
-; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v18.2d, #0
-; CHECK-NEXT: mvn v4.16b, v4.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: cmge v7.2d, v7.2d, #0
-; CHECK-NEXT: cmge v3.2d, v3.2d, #0
-; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d
-; CHECK-NEXT: cmeq v2.2d, v2.2d, v20.2d
-; CHECK-NEXT: cmge v20.2d, v19.2d, #0
-; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: mvn v4.16b, v5.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d
-; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d
-; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
-; CHECK-NEXT: mvn v4.16b, v6.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: mvn v4.16b, v7.16b
-; CHECK-NEXT: mvn v3.16b, v3.16b
-; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b
Modified: llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/saddo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/saddo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/saddo.ll Mon Sep 30 00:58:50 2019
@@ -13,29 +13,25 @@ declare { <2 x i32>, <2 x i1> } @llvm.sa
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s8
-; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
-; SI-NEXT: s_mov_b32 s5, s9
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; SI-NEXT: s_add_u32 s2, s10, s0
-; SI-NEXT: s_addc_u32 s3, s11, s1
-; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: s_add_u32 s10, s6, s8
+; SI-NEXT: s_addc_u32 s11, s7, s9
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: saddo_i64_zext:
@@ -43,22 +39,18 @@ define amdgpu_kernel void @saddo_i64_zex
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: s_add_u32 s8, s6, s0
+; VI-NEXT: s_addc_u32 s9, s7, s1
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
+; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
-; VI-NEXT: s_add_u32 s2, s6, s0
-; VI-NEXT: s_addc_u32 s3, s7, s1
-; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -68,22 +60,18 @@ define amdgpu_kernel void @saddo_i64_zex
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_add_u32 s8, s6, s0
+; GFX9-NEXT: s_addc_u32 s9, s7, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
-; GFX9-NEXT: s_add_u32 s2, s6, s0
-; GFX9-NEXT: s_addc_u32 s3, s7, s1
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
@@ -99,32 +87,27 @@ define amdgpu_kernel void @saddo_i64_zex
define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s8
-; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; SI-NEXT: s_add_i32 s2, s0, s1
-; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT: s_mov_b32 s5, s9
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: s_mov_b32 s8, s10
-; SI-NEXT: s_mov_b32 s9, s11
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0
+; SI-NEXT: s_add_i32 s9, s8, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0
+; SI-NEXT: v_mov_b32_e32 v0, s9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i32:
@@ -133,18 +116,13 @@ define amdgpu_kernel void @s_saddo_i32(i
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
-; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
-; VI-NEXT: s_add_i32 s2, s0, s1
-; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
+; VI-NEXT: s_add_i32 s1, s0, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
+; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -158,18 +136,13 @@ define amdgpu_kernel void @s_saddo_i32(i
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
-; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
-; GFX9-NEXT: s_add_i32 s2, s0, s1
-; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
+; GFX9-NEXT: s_add_i32 s1, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
@@ -204,19 +177,12 @@ define amdgpu_kernel void @v_saddo_i32(i
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
-; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v1
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
@@ -235,17 +201,11 @@ define amdgpu_kernel void @v_saddo_i32(i
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
-; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
-; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v6
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: flat_store_dword v[2:3], v5
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
@@ -265,17 +225,11 @@ define amdgpu_kernel void @v_saddo_i32(i
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: global_store_dword v[2:3], v4, off
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v4
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: global_store_dword v[2:3], v5, off
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@@ -292,31 +246,27 @@ define amdgpu_kernel void @v_saddo_i32(i
define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s15, 0xf000
-; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], -1
-; SI-NEXT: s_add_u32 s2, s8, s10
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[8:9], -1
-; SI-NEXT: s_addc_u32 s3, s9, s11
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_mov_b32 s12, s6
-; SI-NEXT: s_mov_b32 s13, s7
-; SI-NEXT: s_mov_b32 s6, s14
-; SI-NEXT: s_mov_b32 s7, s15
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_add_u32 s12, s4, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_addc_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_xor_b64 s[0:1], s[4:5], vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i64:
@@ -324,22 +274,18 @@ define amdgpu_kernel void @s_saddo_i64(i
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_addc_u32 s3, s5, s7
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -350,22 +296,18 @@ define amdgpu_kernel void @s_saddo_i64(i
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_add_u32 s0, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
+; GFX9-NEXT: s_addc_u32 s1, s5, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX9-NEXT: s_add_u32 s2, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_addc_u32 s3, s5, s7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
@@ -398,19 +340,12 @@ define amdgpu_kernel void @v_saddo_i64(i
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[0:1]
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v2
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
@@ -430,18 +365,12 @@ define amdgpu_kernel void @v_saddo_i64(i
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
-; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4
+; VI-NEXT: v_addc_u32_e32 v9, vcc, v7, v5, vcc
+; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
@@ -461,18 +390,12 @@ define amdgpu_kernel void @v_saddo_i64(i
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v5, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v6, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v5, vcc
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@@ -489,48 +412,35 @@ define amdgpu_kernel void @v_saddo_i64(i
define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s15, 0xf000
-; SI-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s2, s14
-; SI-NEXT: s_mov_b32 s3, s15
+; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s19, 0xf000
+; SI-NEXT: s_mov_b32 s18, -1
+; SI-NEXT: s_mov_b32 s2, s18
+; SI-NEXT: s_mov_b32 s3, s19
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s10
-; SI-NEXT: s_mov_b32 s1, s11
-; SI-NEXT: s_mov_b32 s10, s14
-; SI-NEXT: s_mov_b32 s11, s15
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s14
+; SI-NEXT: s_mov_b32 s1, s15
+; SI-NEXT: s_mov_b32 s14, s18
+; SI-NEXT: s_mov_b32 s15, s19
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
-; SI-NEXT: s_mov_b32 s12, s6
-; SI-NEXT: s_mov_b32 s13, s7
-; SI-NEXT: s_mov_b32 s6, s14
-; SI-NEXT: s_mov_b32 s7, s15
+; SI-NEXT: s_mov_b32 s16, s10
+; SI-NEXT: s_mov_b32 s17, s11
+; SI-NEXT: s_mov_b32 s10, s18
+; SI-NEXT: s_mov_b32 s11, s19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
-; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v1
-; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, v2
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], v6, v2
-; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
+; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
@@ -543,33 +453,21 @@ define amdgpu_kernel void @v_saddo_v2i32
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
-; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
-; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
-; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
-; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
-; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
-; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
-; VI-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
-; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: v_add_u32_e32 v9, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4
+; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9]
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -584,33 +482,21 @@ define amdgpu_kernel void @v_saddo_v2i32
; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
-; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
-; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
+; GFX9-NEXT: v_add_u32_e32 v9, v7, v5
+; GFX9-NEXT: v_add_u32_e32 v8, v6, v4
+; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6
+; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
Modified: llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/addsubo-legalization.ll Mon Sep 30 00:58:50 2019
@@ -95,76 +95,48 @@ define <2 x i1> @usubo(<2 x i64> *%ptr,
define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: saddo:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: vmov.32 r1, d16[1]
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT: vmov.32 r2, d17[1]
-; CHECK-NEXT: vadd.i64 q8, q9, q8
-; CHECK-NEXT: vmov.32 r12, d18[1]
-; CHECK-NEXT: vmov.32 r4, d19[1]
-; CHECK-NEXT: vmov.32 lr, d16[1]
-; CHECK-NEXT: vmov.32 r7, d17[1]
-; CHECK-NEXT: cmp.w r1, #-1
+; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT: vadd.i64 q8, q10, q9
+; CHECK-NEXT: vmov.32 r2, d20[0]
+; CHECK-NEXT: vmov.32 r1, d20[1]
+; CHECK-NEXT: vmov.32 r12, d16[0]
+; CHECK-NEXT: vmov.32 r8, d16[1]
+; CHECK-NEXT: vmov.32 lr, d17[0]
+; CHECK-NEXT: vmov.32 r4, d21[0]
+; CHECK-NEXT: vmov.32 r5, d17[1]
+; CHECK-NEXT: vmov.32 r6, d18[1]
+; CHECK-NEXT: vmov.32 r7, d21[1]
+; CHECK-NEXT: subs.w r2, r12, r2
+; CHECK-NEXT: vmov.32 r2, d19[1]
+; CHECK-NEXT: sbcs.w r1, r8, r1
; CHECK-NEXT: mov.w r1, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r1, #1
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r1, #-1
-; CHECK-NEXT: cmp.w r2, #-1
-; CHECK-NEXT: mov.w r2, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r2, #1
-; CHECK-NEXT: cmp.w r12, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r5, #1
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r5, #-1
-; CHECK-NEXT: cmp.w r4, #-1
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r4, #1
-; CHECK-NEXT: cmp.w lr, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r6, #1
-; CHECK-NEXT: cmp r6, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r6, #-1
-; CHECK-NEXT: cmp.w r7, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: subs.w r4, lr, r4
+; CHECK-NEXT: sbcs.w r7, r5, r7
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r3, #-1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: vdup.32 d19, r3
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r4, #-1
-; CHECK-NEXT: cmp r2, #0
+; CHECK-NEXT: asrs r7, r6, #31
+; CHECK-NEXT: vdup.32 d21, r3
+; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r2, #-1
-; CHECK-NEXT: vdup.32 d23, r2
-; CHECK-NEXT: vdup.32 d21, r4
-; CHECK-NEXT: vdup.32 d18, r6
-; CHECK-NEXT: vdup.32 d22, r1
-; CHECK-NEXT: vdup.32 d20, r5
-; CHECK-NEXT: vceq.i32 q9, q10, q9
+; CHECK-NEXT: movne.w r1, #-1
+; CHECK-NEXT: vdup.32 d20, r1
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vceq.i32 q10, q10, q11
-; CHECK-NEXT: vrev64.32 q11, q9
-; CHECK-NEXT: vrev64.32 q12, q10
-; CHECK-NEXT: vand q9, q9, q11
-; CHECK-NEXT: vand q10, q10, q12
-; CHECK-NEXT: vbic q9, q10, q9
+; CHECK-NEXT: asrs r2, r2, #31
+; CHECK-NEXT: vdup.32 d19, r2
+; CHECK-NEXT: vdup.32 d18, r7
+; CHECK-NEXT: veor q9, q9, q10
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
@@ -177,77 +149,64 @@ define <2 x i1> @saddo(<2 x i64> *%ptr,
define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: ssubo:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
; CHECK-NEXT: vsub.i64 q8, q10, q9
+; CHECK-NEXT: vmov.32 r1, d20[0]
; CHECK-NEXT: vmov.32 r12, d20[1]
-; CHECK-NEXT: vmov.32 lr, d21[1]
-; CHECK-NEXT: vmov.32 r1, d16[1]
-; CHECK-NEXT: vmov.32 r2, d17[1]
-; CHECK-NEXT: vmov.32 r4, d18[1]
-; CHECK-NEXT: vmov.32 r7, d19[1]
-; CHECK-NEXT: cmp.w r1, #-1
+; CHECK-NEXT: vmov.32 r3, d16[0]
+; CHECK-NEXT: vmov.32 lr, d16[1]
+; CHECK-NEXT: vmov.32 r4, d21[0]
+; CHECK-NEXT: vmov.32 r5, d17[0]
+; CHECK-NEXT: vmov.32 r6, d21[1]
+; CHECK-NEXT: vmov.32 r7, d17[1]
+; CHECK-NEXT: vmov.32 r8, d18[1]
+; CHECK-NEXT: subs r1, r3, r1
+; CHECK-NEXT: vmov.32 r3, d18[0]
+; CHECK-NEXT: sbcs.w r1, lr, r12
+; CHECK-NEXT: vmov.32 r12, d19[0]
; CHECK-NEXT: mov.w r1, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r1, #1
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: subs r5, r5, r4
+; CHECK-NEXT: vmov.32 r5, d19[1]
+; CHECK-NEXT: sbcs r7, r6
+; CHECK-NEXT: mov.w r7, #0
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r7, #1
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r7, #-1
+; CHECK-NEXT: vdup.32 d21, r7
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: sbcs.w r3, r2, r8
+; CHECK-NEXT: mov.w r3, #0
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r3, #1
+; CHECK-NEXT: rsbs.w r6, r12, #0
+; CHECK-NEXT: sbcs.w r6, r2, r5
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r2, #1
+; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r1, #-1
-; CHECK-NEXT: cmp.w r2, #-1
-; CHECK-NEXT: mov.w r2, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r2, #1
-; CHECK-NEXT: cmp.w r12, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r5, #1
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r5, #-1
-; CHECK-NEXT: cmp.w lr, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r6, #1
-; CHECK-NEXT: cmp.w r4, #-1
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r4, #-1
-; CHECK-NEXT: cmp.w r7, #-1
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #1
+; CHECK-NEXT: movne.w r2, #-1
; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: vdup.32 d19, r2
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r3, #-1
-; CHECK-NEXT: vdup.32 d19, r3
-; CHECK-NEXT: cmp r6, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r6, #-1
-; CHECK-NEXT: vdup.32 d21, r6
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: vdup.32 d18, r4
+; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r2, #-1
-; CHECK-NEXT: vdup.32 d23, r2
-; CHECK-NEXT: vdup.32 d20, r5
-; CHECK-NEXT: vdup.32 d22, r1
-; CHECK-NEXT: vceq.i32 q9, q10, q9
+; CHECK-NEXT: movne.w r1, #-1
+; CHECK-NEXT: vdup.32 d18, r3
+; CHECK-NEXT: vdup.32 d20, r1
+; CHECK-NEXT: veor q9, q9, q10
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vceq.i32 q10, q10, q11
-; CHECK-NEXT: vrev64.32 q11, q9
-; CHECK-NEXT: vrev64.32 q12, q10
-; CHECK-NEXT: vand q9, q9, q11
-; CHECK-NEXT: vand q10, q10, q12
-; CHECK-NEXT: vmvn q9, q9
-; CHECK-NEXT: vbic q9, q9, q10
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
Modified: llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll (original)
+++ llvm/trunk/test/CodeGen/RISCV/arith-with-overflow.ll Mon Sep 30 00:58:50 2019
@@ -10,17 +10,11 @@ declare {i32, i1} @llvm.usub.with.overfl
define i1 @sadd(i32 %a, i32 %b, i32* %c) nounwind {
; RV32I-LABEL: sadd:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: addi a3, zero, -1
-; RV32I-NEXT: slt a4, a3, a1
-; RV32I-NEXT: slt a5, a3, a0
-; RV32I-NEXT: xor a4, a5, a4
-; RV32I-NEXT: seqz a4, a4
-; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: slt a0, a3, a1
-; RV32I-NEXT: xor a0, a5, a0
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: and a0, a4, a0
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: slt a0, a3, a0
+; RV32I-NEXT: slti a1, a1, 0
+; RV32I-NEXT: xor a0, a1, a0
+; RV32I-NEXT: sw a3, 0(a2)
; RV32I-NEXT: ret
entry:
%x = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -33,16 +27,10 @@ entry:
define i1 @ssub(i32 %a, i32 %b, i32* %c) nounwind {
; RV32I-LABEL: ssub:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: addi a3, zero, -1
-; RV32I-NEXT: slt a4, a3, a1
-; RV32I-NEXT: slt a5, a3, a0
-; RV32I-NEXT: xor a4, a5, a4
-; RV32I-NEXT: snez a4, a4
+; RV32I-NEXT: sgtz a3, a1
; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: slt a0, a3, a1
-; RV32I-NEXT: xor a0, a5, a0
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: and a0, a4, a0
+; RV32I-NEXT: slt a0, a1, a0
+; RV32I-NEXT: xor a0, a3, a0
; RV32I-NEXT: sw a1, 0(a2)
; RV32I-NEXT: ret
entry:
Modified: llvm/trunk/test/CodeGen/X86/combine-mulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-mulo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-mulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-mulo.ll Mon Sep 30 00:58:50 2019
@@ -34,30 +34,21 @@ define <4 x i32> @combine_vec_smul_two(<
; SSE-LABEL: combine_vec_smul_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: paddd %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_smul_two:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
Modified: llvm/trunk/test/CodeGen/X86/mulo-pow2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mulo-pow2.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mulo-pow2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mulo-pow2.ll Mon Sep 30 00:58:50 2019
@@ -98,15 +98,10 @@ define <4 x i32> @smul_v4i32_1(<4 x i32>
define <4 x i32> @smul_v4i32_2(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
Modified: llvm/trunk/test/CodeGen/X86/sadd_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sadd_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sadd_sat.ll Mon Sep 30 00:58:50 2019
@@ -183,28 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
;
; X64-LABEL: vec:
; X64: # %bb.0:
+; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pxor %xmm4, %xmm4
-; X64-NEXT: pcmpgtd %xmm1, %xmm4
-; X64-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NEXT: pxor %xmm2, %xmm4
-; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm0, %xmm5
-; X64-NEXT: pxor %xmm2, %xmm5
-; X64-NEXT: pcmpeqd %xmm5, %xmm4
-; X64-NEXT: paddd %xmm1, %xmm0
-; X64-NEXT: pcmpgtd %xmm0, %xmm3
-; X64-NEXT: pxor %xmm3, %xmm2
-; X64-NEXT: pcmpeqd %xmm5, %xmm2
-; X64-NEXT: pandn %xmm4, %xmm2
-; X64-NEXT: movdqa %xmm3, %xmm1
-; X64-NEXT: pandn {{.*}}(%rip), %xmm1
-; X64-NEXT: psrld $1, %xmm3
-; X64-NEXT: por %xmm1, %xmm3
-; X64-NEXT: pand %xmm2, %xmm3
-; X64-NEXT: pandn %xmm0, %xmm2
+; X64-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: pxor %xmm3, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: pandn {{.*}}(%rip), %xmm3
+; X64-NEXT: psrld $1, %xmm2
; X64-NEXT: por %xmm3, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: pand %xmm0, %xmm2
+; X64-NEXT: pandn %xmm1, %xmm0
+; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
ret <4 x i32> %tmp;
Modified: llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -598,133 +598,88 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: paddd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSSE3-NEXT: psrld $1, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: pandn %xmm4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: paddd %xmm1, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
@@ -733,133 +688,88 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: paddd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSSE3-NEXT: psrld $1, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: pandn %xmm4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: paddd %xmm1, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %z
@@ -868,214 +778,135 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm2, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pandn %xmm4, %xmm7
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm6, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm8
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7
-; SSSE3-NEXT: paddd %xmm2, %xmm8
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: paddd %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm5, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm7, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pandn %xmm4, %xmm7
-; SSSE3-NEXT: psrld $1, %xmm2
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: pandn %xmm6, %xmm7
+; SSSE3-NEXT: psrld $1, %xmm5
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: pand %xmm0, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT: paddd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSSE3-NEXT: psrld $1, %xmm6
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm1, %xmm5
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: paddd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: psrld $1, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
-; SSE41-NEXT: paddd %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: pandn %xmm7, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movaps %xmm5, %xmm7
-; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm7
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm6
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT: pandn %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: paddd %xmm2, %xmm5
+; SSE41-NEXT: movaps {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movaps %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movaps %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5
+; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %ymm5, {{.*}}(%rip), %ymm6, %ymm6
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm6, %ymm5, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %ymm0, %ymm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovdqa %ymm1, %ymm0
; AVX512-NEXT: retq
%z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
@@ -1084,378 +915,230 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm9, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pxor %xmm9, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm12
-; SSE2-NEXT: paddd %xmm4, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pandn %xmm9, %xmm11
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: por %xmm11, %xmm10
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm12, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm12, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: pandn %xmm9, %xmm10
; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm11, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm11
-; SSE2-NEXT: pxor %xmm9, %xmm11
+; SSE2-NEXT: por %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm11
-; SSE2-NEXT: paddd %xmm5, %xmm8
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm12, %xmm4
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: paddd %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: pxor %xmm9, %xmm8
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm12, %xmm4
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm6, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
-; SSE2-NEXT: pxor %xmm10, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
-; SSE2-NEXT: pandn %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: pandn %xmm12, %xmm2
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: por %xmm2, %xmm10
-; SSE2-NEXT: pand %xmm9, %xmm10
-; SSE2-NEXT: pandn %xmm3, %xmm9
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm9, %xmm4
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: por %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pandn %xmm7, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm11
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT: paddd %xmm0, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm9, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pxor %xmm12, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pxor %xmm9, %xmm12
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm12
-; SSSE3-NEXT: paddd %xmm4, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm10, %xmm11
+; SSSE3-NEXT: pandn %xmm9, %xmm11
+; SSSE3-NEXT: psrld $1, %xmm10
+; SSSE3-NEXT: por %xmm11, %xmm10
+; SSSE3-NEXT: pand %xmm0, %xmm10
+; SSSE3-NEXT: pandn %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm10, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm12, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm12, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: paddd %xmm1, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm10
+; SSSE3-NEXT: pandn %xmm9, %xmm10
; SSSE3-NEXT: psrld $1, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm11, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
-; SSSE3-NEXT: pxor %xmm9, %xmm11
+; SSSE3-NEXT: por %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11
-; SSSE3-NEXT: paddd %xmm5, %xmm8
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm11, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm12, %xmm4
-; SSSE3-NEXT: psrld $1, %xmm5
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: paddd %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: paddd %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm8
-; SSSE3-NEXT: pxor %xmm9, %xmm8
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT: pandn %xmm4, %xmm8
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pandn %xmm12, %xmm4
-; SSSE3-NEXT: psrld $1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm8, %xmm6
-; SSSE3-NEXT: pandn %xmm2, %xmm8
-; SSSE3-NEXT: por %xmm6, %xmm8
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pandn %xmm9, %xmm5
+; SSSE3-NEXT: psrld $1, %xmm4
+; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: por %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT: paddd %xmm7, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
-; SSSE3-NEXT: pxor %xmm10, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9
-; SSSE3-NEXT: pandn %xmm2, %xmm9
-; SSSE3-NEXT: movdqa %xmm10, %xmm2
-; SSSE3-NEXT: pandn %xmm12, %xmm2
-; SSSE3-NEXT: psrld $1, %xmm10
-; SSSE3-NEXT: por %xmm2, %xmm10
-; SSSE3-NEXT: pand %xmm9, %xmm10
-; SSSE3-NEXT: pandn %xmm3, %xmm9
-; SSSE3-NEXT: por %xmm10, %xmm9
-; SSSE3-NEXT: movdqa %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
+; SSSE3-NEXT: paddd %xmm3, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm9, %xmm4
+; SSSE3-NEXT: psrld $1, %xmm8
+; SSSE3-NEXT: por %xmm4, %xmm8
+; SSSE3-NEXT: pand %xmm3, %xmm8
+; SSSE3-NEXT: pandn %xmm7, %xmm3
+; SSSE3-NEXT: por %xmm8, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm2, %xmm12
+; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: pxor %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm11, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT: pxor %xmm10, %xmm11
-; SSE41-NEXT: pxor %xmm12, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT: pxor %xmm10, %xmm12
-; SSE41-NEXT: pcmpeqd %xmm12, %xmm11
; SSE41-NEXT: paddd %xmm4, %xmm9
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
-; SSE41-NEXT: pandn %xmm11, %xmm4
-; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movaps %xmm11, %xmm13
-; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm13
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm9
-; SSE41-NEXT: xorps %xmm13, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm13
-; SSE41-NEXT: pxor %xmm10, %xmm13
-; SSE41-NEXT: pxor %xmm14, %xmm14
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm14
-; SSE41-NEXT: pxor %xmm10, %xmm14
-; SSE41-NEXT: pcmpeqd %xmm14, %xmm13
-; SSE41-NEXT: paddd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm14, %xmm4
-; SSE41-NEXT: pandn %xmm13, %xmm4
-; SSE41-NEXT: movaps %xmm11, %xmm5
-; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm13, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm13
-; SSE41-NEXT: pxor %xmm10, %xmm13
-; SSE41-NEXT: xorps %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm13
-; SSE41-NEXT: paddd %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: pandn %xmm13, %xmm4
-; SSE41-NEXT: movaps %xmm11, %xmm5
-; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm5
+; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movaps {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movaps %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: paddd %xmm5, %xmm4
+; SSE41-NEXT: movaps %xmm10, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: paddd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm10
-; SSE41-NEXT: pandn %xmm0, %xmm10
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm12, %xmm3
+; SSE41-NEXT: paddd %xmm6, %xmm3
+; SSE41-NEXT: movaps %xmm10, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm12
+; SSE41-NEXT: pxor %xmm6, %xmm12
+; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: paddd %xmm7, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT: pxor %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT: movaps %xmm9, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm12, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm10
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm10, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm11
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm11, %xmm6, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm10, %xmm10
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm12, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-NEXT: vandnps %ymm8, %ymm4, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7
; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm10, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvps %ymm4, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm11
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm11, %xmm7, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm11
-; AVX1-NEXT: vpcmpgtd %xmm11, %xmm12, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm12, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendvps %ymm3, %ymm8, %ymm10, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm9, %ymm10
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm10, %ymm7, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6
+; AVX1-NEXT: vblendvps %ymm6, %ymm8, %ymm9, %ymm7
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vblendvps %ymm1, %ymm7, %ymm6, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpandn %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vblendvps %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %ymm4, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm4, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm4
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vblendvps %ymm1, %ymm4, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %zmm0, %zmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
@@ -1465,152 +1148,120 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; SSE2-LABEL: v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm1
; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm1, %xmm3
; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT: pandn %xmm4, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1620,56 +1271,36 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
ret <2 x i64> %z
@@ -1678,369 +1309,279 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-LABEL: v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT: paddq %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm11, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm11, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT: paddq %xmm2, %xmm9
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm10, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm10, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,0,3,2]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm10, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: paddq %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pandn %xmm9, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm11, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm0, %xmm5
-; SSSE3-NEXT: pandn %xmm9, %xmm0
-; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: paddq %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm10, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pandn %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm11, %xmm3
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pandn %xmm9, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm3
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: paddq %xmm2, %xmm9
-; SSE41-NEXT: pxor %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: paddq %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm7, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm7
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pandn %xmm7, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm6, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm10, %xmm3
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm3, %xmm4
-; SSE41-NEXT: pandn %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
-; SSE41-NEXT: movapd %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm7
+; SSE41-NEXT: por %xmm0, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm7, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqq %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpaddq %xmm2, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vandnpd %ymm8, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5
+; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %ymm5, {{.*}}(%rip), %ymm6, %ymm6
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm6, %ymm5, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %ymm0, %ymm2, %k2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k0
+; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovdqa %ymm1, %ymm0
; AVX512-NEXT: retq
%z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@@ -2050,687 +1591,513 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
; SSE2-LABEL: v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm0, %xmm12
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE2-NEXT: paddq %xmm4, %xmm13
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm12
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm14
-; SSE2-NEXT: pxor %xmm10, %xmm14
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm13, %xmm0
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm11, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm11, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm12, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm13, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm10, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm12, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
; SSE2-NEXT: paddq %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm13, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm13, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm12, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm15
-; SSE2-NEXT: pxor %xmm10, %xmm15
-; SSE2-NEXT: pcmpeqd %xmm15, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm13
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm14, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm15, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm13, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
; SSE2-NEXT: paddq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm6
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm9, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm8
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
-; SSE2-NEXT: pand %xmm12, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
; SSE2-NEXT: paddq %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm9, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm7
-; SSE2-NEXT: pxor %xmm7, %xmm10
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2]
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm11, %xmm2
-; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: pandn %xmm10, %xmm2
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm13
+; SSSE3-NEXT: movdqa %xmm0, %xmm12
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSSE3-NEXT: paddq %xmm4, %xmm13
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: paddq %xmm4, %xmm12
+; SSSE3-NEXT: movdqa %xmm12, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm14
-; SSSE3-NEXT: pxor %xmm10, %xmm14
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,0,3,2]
-; SSSE3-NEXT: pand %xmm1, %xmm11
-; SSSE3-NEXT: movdqa %xmm13, %xmm0
-; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm9, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm11, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm11, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm11, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm12, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm13, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm10, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm10, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT: pand %xmm11, %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm12, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm1
; SSSE3-NEXT: paddq %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm13, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm12
; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm9, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm13, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm15
-; SSSE3-NEXT: pxor %xmm10, %xmm15
-; SSSE3-NEXT: pcmpeqd %xmm15, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm13
-; SSSE3-NEXT: movdqa %xmm8, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm14, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm12, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm15, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm13, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
; SSSE3-NEXT: pand %xmm12, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pandn %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm11, %xmm5
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm1, %xmm5
; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
; SSSE3-NEXT: paddq %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm8
; SSSE3-NEXT: pxor %xmm9, %xmm6
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm9, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: pand %xmm12, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm8
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
-; SSSE3-NEXT: pand %xmm12, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm10, %xmm5
+; SSSE3-NEXT: pand %xmm11, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm4, %xmm6
+; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
; SSSE3-NEXT: paddq %xmm7, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
; SSSE3-NEXT: pxor %xmm9, %xmm7
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm9, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm7
-; SSSE3-NEXT: pxor %xmm7, %xmm10
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,0,3,2]
-; SSSE3-NEXT: pand %xmm10, %xmm5
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm11, %xmm2
-; SSSE3-NEXT: pand %xmm12, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pand %xmm5, %xmm7
-; SSSE3-NEXT: pandn %xmm3, %xmm5
-; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm6
; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: pandn %xmm10, %xmm2
+; SSSE3-NEXT: pand %xmm11, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm11
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm8
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm10
+; SSE41-NEXT: pxor %xmm9, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm12
+; SSE41-NEXT: por %xmm0, %xmm12
+; SSE41-NEXT: pxor %xmm9, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm12
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE41-NEXT: pxor %xmm9, %xmm12
-; SSE41-NEXT: pxor %xmm10, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm12, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm11
; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm11
-; SSE41-NEXT: pxor %xmm9, %xmm11
-; SSE41-NEXT: pcmpeqq %xmm11, %xmm12
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm11, %xmm4
-; SSE41-NEXT: pandn %xmm12, %xmm4
-; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm11, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm10, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm8
-; SSE41-NEXT: movdqa %xmm1, %xmm14
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: paddq %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm15
-; SSE41-NEXT: pxor %xmm9, %xmm15
-; SSE41-NEXT: pxor %xmm10, %xmm14
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm14, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm12
+; SSE41-NEXT: pxor %xmm9, %xmm12
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm5
+; SSE41-NEXT: pand %xmm4, %xmm13
+; SSE41-NEXT: por %xmm0, %xmm13
; SSE41-NEXT: pxor %xmm9, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm15
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: pandn %xmm15, %xmm4
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm14, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm13, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm13
+; SSE41-NEXT: movdqa %xmm9, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm13, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: paddq %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm5
-; SSE41-NEXT: pxor %xmm9, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm9, %xmm6
-; SSE41-NEXT: pcmpeqq %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm12
+; SSE41-NEXT: pxor %xmm9, %xmm12
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm6, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm5
+; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm9, %xmm6
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm13, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm5
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: paddq %xmm7, %xmm3
-; SSE41-NEXT: pxor %xmm10, %xmm7
-; SSE41-NEXT: movdqa %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: pxor %xmm9, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm9
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm9
-; SSE41-NEXT: pandn %xmm4, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm9, %xmm7
; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm12, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm10
-; AVX1-NEXT: vpcmpeqq %xmm8, %xmm10, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm11, %xmm6, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT: vpaddq %xmm9, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm4, %xmm10, %xmm10
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm12, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-NEXT: vandnpd %ymm8, %ymm4, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7
; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT: vmovapd {{.*#+}} ymm10 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm10, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm4, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm7, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm12, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqq %xmm11, %xmm7, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm11, %xmm12, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm12, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vandnpd %ymm9, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendvpd %ymm3, %ymm8, %ymm10, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovapd {{.*#+}} ymm9 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm9, %ymm10
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm10, %ymm7, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm6
+; AVX1-NEXT: vblendvpd %ymm6, %ymm8, %ymm9, %ymm7
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vblendvpd %ymm1, %ymm7, %ymm6, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT: vpcmpeqq %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpandn %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpeqq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm4, %ymm0
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k0
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
Modified: llvm/trunk/test/CodeGen/X86/ssub_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssub_sat.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssub_sat.ll Mon Sep 30 00:58:50 2019
@@ -183,30 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x
;
; X64-LABEL: vec:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: pcmpgtd %xmm1, %xmm0
-; X64-NEXT: pcmpeqd %xmm4, %xmm4
-; X64-NEXT: pxor %xmm4, %xmm0
-; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm2, %xmm5
-; X64-NEXT: pxor %xmm4, %xmm5
-; X64-NEXT: pcmpeqd %xmm5, %xmm0
-; X64-NEXT: psubd %xmm1, %xmm2
-; X64-NEXT: pcmpgtd %xmm2, %xmm3
-; X64-NEXT: movdqa %xmm3, %xmm1
-; X64-NEXT: pxor %xmm4, %xmm1
-; X64-NEXT: pcmpeqd %xmm5, %xmm1
-; X64-NEXT: pxor %xmm4, %xmm1
-; X64-NEXT: pandn %xmm1, %xmm0
-; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psubd %xmm1, %xmm3
+; X64-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-NEXT: pcmpgtd %xmm3, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm0
+; X64-NEXT: pcmpgtd %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm1
; X64-NEXT: pandn {{.*}}(%rip), %xmm1
-; X64-NEXT: psrld $1, %xmm3
-; X64-NEXT: por %xmm1, %xmm3
-; X64-NEXT: pand %xmm0, %xmm3
-; X64-NEXT: pandn %xmm2, %xmm0
-; X64-NEXT: por %xmm3, %xmm0
+; X64-NEXT: psrld $1, %xmm2
+; X64-NEXT: por %xmm1, %xmm2
+; X64-NEXT: pand %xmm0, %xmm2
+; X64-NEXT: pandn %xmm3, %xmm0
+; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
Modified: llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll Mon Sep 30 00:58:50 2019
@@ -598,141 +598,94 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: psrld $1, %xmm2
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
@@ -741,141 +694,94 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2
define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: psrld $1, %xmm2
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %z
@@ -884,226 +790,144 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm7, %xmm2
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pxor %xmm8, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm0
-; SSSE3-NEXT: psubd %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pandn %xmm7, %xmm2
-; SSSE3-NEXT: psrld $1, %xmm6
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm0, %xmm6
-; SSSE3-NEXT: pandn %xmm4, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT: psubd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSSE3-NEXT: pandn %xmm7, %xmm3
-; SSSE3-NEXT: psrld $1, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: psubd %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm2, %xmm7
+; SSSE3-NEXT: pandn %xmm6, %xmm7
+; SSSE3-NEXT: psrld $1, %xmm2
+; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm5, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psubd %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm6, %xmm3
+; SSSE3-NEXT: psrld $1, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm8, %xmm8
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm6
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm8, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
; SSE41-NEXT: psubd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm6
-; SSE41-NEXT: movaps {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movaps {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movaps %xmm7, %xmm2
-; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm8, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm8, %xmm3
-; SSE41-NEXT: pandn %xmm3, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movaps {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movaps %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm7
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psubd %xmm3, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2
; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %ymm0, %ymm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovdqa %ymm1, %ymm0
; AVX512-NEXT: retq
%z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
@@ -1112,399 +936,244 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm12
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm11, %xmm4
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm12, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm12
-; SSE2-NEXT: pxor %xmm10, %xmm12
-; SSE2-NEXT: pcmpeqd %xmm12, %xmm1
-; SSE2-NEXT: psubd %xmm5, %xmm8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm12, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT: pxor %xmm10, %xmm8
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm8
-; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: psubd %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pandn %xmm6, %xmm8
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm11
+; SSE2-NEXT: pandn %xmm10, %xmm11
; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm4, %xmm8
+; SSE2-NEXT: por %xmm11, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm9, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE2-NEXT: psubd %xmm7, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm11, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: por %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm5, %xmm9
-; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pandn %xmm10, %xmm9
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: por %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm6, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm6
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubd %xmm7, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm12
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm12, %xmm11
-; SSSE3-NEXT: pxor %xmm10, %xmm11
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: psubd %xmm4, %xmm12
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm12, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pandn %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm11, %xmm4
-; SSSE3-NEXT: psrld $1, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm12, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pxor %xmm12, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm12
-; SSSE3-NEXT: pxor %xmm10, %xmm12
-; SSSE3-NEXT: pcmpeqd %xmm12, %xmm1
-; SSSE3-NEXT: psubd %xmm5, %xmm8
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm12, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pandn %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
-; SSSE3-NEXT: psrld $1, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm8
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
-; SSSE3-NEXT: pxor %xmm10, %xmm8
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8
-; SSSE3-NEXT: psubd %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: psubd %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pandn %xmm6, %xmm8
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm4, %xmm11
+; SSSE3-NEXT: pandn %xmm10, %xmm11
; SSSE3-NEXT: psrld $1, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm8
-; SSSE3-NEXT: por %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm11, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm9, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: psubd %xmm5, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm9
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm2
-; SSSE3-NEXT: pandn %xmm11, %xmm2
-; SSSE3-NEXT: psrld $1, %xmm9
-; SSSE3-NEXT: por %xmm2, %xmm9
-; SSSE3-NEXT: pand %xmm5, %xmm9
-; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pandn %xmm10, %xmm9
+; SSSE3-NEXT: psrld $1, %xmm5
; SSSE3-NEXT: por %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pand %xmm1, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: psubd %xmm6, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm10, %xmm6
+; SSSE3-NEXT: psrld $1, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psubd %xmm7, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm7, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm10, %xmm5
+; SSSE3-NEXT: psrld $1, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
+; SSSE3-NEXT: pand %xmm3, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm8, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: movdqa %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm10, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm11
-; SSE41-NEXT: pxor %xmm11, %xmm10
-; SSE41-NEXT: pxor %xmm12, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT: pxor %xmm11, %xmm12
-; SSE41-NEXT: pcmpeqd %xmm12, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm9
; SSE41-NEXT: psubd %xmm4, %xmm9
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm11, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
-; SSE41-NEXT: pxor %xmm11, %xmm4
-; SSE41-NEXT: pandn %xmm4, %xmm10
-; SSE41-NEXT: movaps {{.*#+}} xmm13 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movaps %xmm12, %xmm4
-; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm4
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm12 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movaps {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movaps %xmm11, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm9
-; SSE41-NEXT: xorps %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm11, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm10
-; SSE41-NEXT: pxor %xmm11, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE41-NEXT: psubd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm11, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pxor %xmm11, %xmm5
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: movaps %xmm12, %xmm5
-; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm11, %xmm4
-; SSE41-NEXT: xorps %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm11, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: psubd %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm11, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm11, %xmm6
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: movaps %xmm12, %xmm5
-; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psubd %xmm5, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm11, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pxor %xmm11, %xmm4
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm6, %xmm2
+; SSE41-NEXT: movaps %xmm11, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm8, %xmm5
-; SSE41-NEXT: pxor %xmm11, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm11, %xmm5
-; SSE41-NEXT: pandn %xmm5, %xmm4
+; SSE41-NEXT: psubd %xmm7, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm11
; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm13, %xmm12
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm12, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movaps %xmm9, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm10, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm10, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm12
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm12, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm11, %xmm5, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT: vpsubd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm10, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5
-; AVX1-NEXT: vandnps %ymm5, %ymm8, %ymm5
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm11 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm11, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvps %ymm5, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm10, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm12
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm12
-; AVX1-NEXT: vpcmpgtd %xmm12, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm10, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vandnps %ymm2, %ymm9, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT: vblendvps %ymm3, %ymm8, %ymm11, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm7
+; AVX1-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm7, %xmm7
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm3
+; AVX1-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpandn %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm7 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vblendvps %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %ymm2, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendvps %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %zmm0, %zmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
@@ -1514,50 +1183,38 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; SSE2-LABEL: v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
@@ -1570,50 +1227,38 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; SSSE3-LABEL: v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm2
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3
; SSSE3-NEXT: por %xmm2, %xmm3
@@ -1626,46 +1271,32 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; SSE41-LABEL: v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pandn %xmm3, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1676,57 +1307,39 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
ret <2 x i64> %z
@@ -1735,381 +1348,285 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-LABEL: v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE2-NEXT: psubq %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: psubq %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm9, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm11, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm10, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: psubq %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm10
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSSE3-NEXT: psubq %xmm2, %xmm10
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: psubq %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pxor %xmm9, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: movdqa %xmm10, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm9, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,0,3,2]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm11, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm10, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pandn %xmm9, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: psubq %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pandn %xmm9, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm3
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
-; SSE41-NEXT: psubq %xmm2, %xmm9
-; SSE41-NEXT: pxor %xmm11, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: psubq %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT: pxor %xmm10, %xmm2
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm4, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pandn %xmm5, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm7, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psubq %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm11, %xmm3
-; SSE41-NEXT: movdqa %xmm11, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm10, %xmm2
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm10, %xmm3
-; SSE41-NEXT: pcmpeqq %xmm3, %xmm2
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm6, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: psubq %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm7
+; SSE41-NEXT: por %xmm0, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm7, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movapd %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqq %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqq %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vandnpd %ymm2, %ymm8, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %ymm1, {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpnltq %ymm2, %ymm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %ymm0, %ymm2, %k2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovdqa %ymm1, %ymm0
; AVX512-NEXT: retq
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@@ -2119,414 +1636,324 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
; SSE2-LABEL: v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm0, %xmm12
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE2-NEXT: psubq %xmm4, %xmm13
+; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: psubq %xmm4, %xmm12
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm10
; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm15
-; SSE2-NEXT: pxor %xmm10, %xmm15
-; SSE2-NEXT: pcmpeqd %xmm15, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm14
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm14, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: pcmpeqd %xmm15, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm10, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
; SSE2-NEXT: pand %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm11, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE2-NEXT: pand %xmm12, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm13, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm12, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
; SSE2-NEXT: psubq %xmm5, %xmm8
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm13, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm13
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm13
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm14, %xmm15
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3]
-; SSE2-NEXT: por %xmm15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm13
-; SSE2-NEXT: pxor %xmm10, %xmm13
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm13
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2]
; SSE2-NEXT: pand %xmm13, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm12, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm12, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
; SSE2-NEXT: psubq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm6
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm8
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pandn %xmm6, %xmm8
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm11, %xmm5
-; SSE2-NEXT: pand %xmm12, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm4, %xmm8
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psubq %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: psubq %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm11, %xmm2
-; SSE2-NEXT: pand %xmm12, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: pandn %xmm10, %xmm2
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm13
+; SSSE3-NEXT: movdqa %xmm0, %xmm12
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSSE3-NEXT: psubq %xmm4, %xmm13
+; SSSE3-NEXT: pxor %xmm9, %xmm0
+; SSSE3-NEXT: psubq %xmm4, %xmm12
+; SSSE3-NEXT: movdqa %xmm12, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm11, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm10
; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm10
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pand %xmm11, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm15
-; SSSE3-NEXT: pxor %xmm10, %xmm15
-; SSSE3-NEXT: pcmpeqd %xmm15, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm13, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm14
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm14, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm11
-; SSSE3-NEXT: pxor %xmm10, %xmm11
-; SSSE3-NEXT: pcmpeqd %xmm15, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm10, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
; SSSE3-NEXT: pand %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pandn %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm11, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSSE3-NEXT: pand %xmm12, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm13, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm12, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm1
; SSSE3-NEXT: psubq %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm13, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm12
; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm13, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm8, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm13
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm13
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm12, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm14, %xmm15
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,3,3]
-; SSSE3-NEXT: por %xmm15, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm13
-; SSSE3-NEXT: pxor %xmm10, %xmm13
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm13
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,0,3,2]
-; SSSE3-NEXT: pand %xmm13, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pandn %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm12, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pandn %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm11, %xmm5
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm1, %xmm5
; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
; SSSE3-NEXT: psubq %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm8
; SSSE3-NEXT: pxor %xmm9, %xmm6
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm4
+; SSSE3-NEXT: pand %xmm12, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm8
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pandn %xmm6, %xmm8
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm11, %xmm5
-; SSSE3-NEXT: pand %xmm12, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm8
-; SSSE3-NEXT: por %xmm4, %xmm8
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: psubq %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm7
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm10, %xmm5
+; SSSE3-NEXT: pand %xmm11, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm4, %xmm6
+; SSSE3-NEXT: pandn %xmm2, %xmm4
; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: psubq %xmm7, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm9, %xmm7
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm10, %xmm6
-; SSSE3-NEXT: pandn %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pandn %xmm11, %xmm2
-; SSSE3-NEXT: pand %xmm12, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: pandn %xmm10, %xmm2
+; SSSE3-NEXT: pand %xmm11, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: pand %xmm5, %xmm6
; SSSE3-NEXT: pandn %xmm3, %xmm5
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: retq
;
@@ -2534,293 +1961,198 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm11
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: psubq %xmm4, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm10
+; SSE41-NEXT: pxor %xmm9, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm12
+; SSE41-NEXT: por %xmm0, %xmm12
; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm12, %xmm15
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE41-NEXT: pxor %xmm10, %xmm15
-; SSE41-NEXT: pxor %xmm9, %xmm11
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm12
-; SSE41-NEXT: pxor %xmm10, %xmm12
-; SSE41-NEXT: pcmpeqq %xmm12, %xmm15
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm11, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm12, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pcmpeqq %xmm12, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pandn %xmm4, %xmm15
-; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm11, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4
-; SSE41-NEXT: movdqa %xmm15, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm10, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: psubq %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm12
+; SSE41-NEXT: pxor %xmm9, %xmm12
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm13
+; SSE41-NEXT: por %xmm0, %xmm13
; SSE41-NEXT: pxor %xmm9, %xmm5
-; SSE41-NEXT: movdqa %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm14, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm13, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm13
; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm14
-; SSE41-NEXT: pxor %xmm10, %xmm14
-; SSE41-NEXT: pcmpeqq %xmm14, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm14, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: psubq %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm12
+; SSE41-NEXT: pxor %xmm9, %xmm12
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm5
+; SSE41-NEXT: por %xmm0, %xmm5
; SSE41-NEXT: pxor %xmm9, %xmm6
-; SSE41-NEXT: movdqa %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm13, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm5
; SSE41-NEXT: movdqa %xmm9, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: psubq %xmm7, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm9, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
; SSE41-NEXT: pxor %xmm9, %xmm7
-; SSE41-NEXT: movdqa %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm10, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm10, %xmm6
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm10, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm12
-; AVX1-NEXT: vpcmpeqq %xmm8, %xmm12, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm11, %xmm5, %xmm11
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm6, %xmm12, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm10, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5
-; AVX1-NEXT: vandnpd %ymm5, %ymm8, %ymm5
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm7
-; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %ymm7, %ymm8, %ymm11, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm5, %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm10, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm7, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqq %xmm12, %xmm7, %xmm12
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm12
-; AVX1-NEXT: vpcmpgtq %xmm12, %xmm10, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm10, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpeqq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vandnpd %ymm2, %ymm9, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT: vblendvpd %ymm3, %ymm8, %ymm11, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm7
+; AVX1-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm6, %ymm3
+; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm8
-; AVX2-NEXT: vpcmpeqq %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpandn %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpeqq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm8, %ymm3
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm6, %ymm7
+; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm6, %ymm2
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpnltq %zmm2, %zmm0, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %zmm2, %zmm1, %k0
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm2, %k2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k2}
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
Modified: llvm/trunk/test/CodeGen/X86/vec_saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_saddo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_saddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll Mon Sep 30 00:58:50 2019
@@ -52,67 +52,40 @@ define <2 x i32> @saddo_v2i32(<2 x i32>
; SSE-LABEL: saddo_v2i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movq %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovq %xmm1, (%rdi)
@@ -129,82 +102,45 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
; SSE2-LABEL: saddo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, 8(%rdi)
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: saddo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: paddd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movd %xmm0, 8(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: saddo_v3i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: pandn %xmm3, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi)
+; SSE41-NEXT: movq %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: saddo_v3i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -212,17 +148,10 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
; AVX2-LABEL: saddo_v3i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -230,13 +159,10 @@ define <3 x i32> @saddo_v3i32(<3 x i32>
; AVX512-LABEL: saddo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
@@ -254,67 +180,40 @@ define <4 x i32> @saddo_v4i32(<4 x i32>
; SSE-LABEL: saddo_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: saddo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@@ -331,164 +230,118 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
; SSE2-LABEL: saddo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: movd %esi, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: movd %esi, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movd %r9d, %xmm3
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movd %r9d, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
-; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm2
-; SSE2-NEXT: movq %xmm1, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm0, (%rcx)
-; SSE2-NEXT: movq %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: movq %xmm2, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: movq %xmm5, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm6, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: saddo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: movd %esi, %xmm4
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT: movd %esi, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: movd %r9d, %xmm3
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movd %r9d, %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT: paddd %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm4
-; SSSE3-NEXT: pandn %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: paddd %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT: paddd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm6, %xmm2
-; SSSE3-NEXT: movq %xmm1, 16(%rcx)
-; SSSE3-NEXT: movdqa %xmm0, (%rcx)
-; SSSE3-NEXT: movq %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: paddd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: movq %xmm2, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm4, (%rcx)
+; SSSE3-NEXT: movq %xmm5, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm6, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: saddo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: movd %esi, %xmm4
-; SSE41-NEXT: pinsrd $1, %edx, %xmm4
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm4
-; SSE41-NEXT: pinsrd $3, %r8d, %xmm4
-; SSE41-NEXT: movd %r9d, %xmm2
-; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd %esi, %xmm1
+; SSE41-NEXT: pinsrd $1, %edx, %xmm1
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm1
+; SSE41-NEXT: movd %r9d, %xmm0
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT: paddd %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
-; SSE41-NEXT: pandn %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: paddd %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pandn %xmm6, %xmm3
-; SSE41-NEXT: movq %xmm0, 16(%rcx)
-; SSE41-NEXT: movdqa %xmm1, (%rcx)
-; SSE41-NEXT: movq %xmm3, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm1, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movq %xmm2, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm4, (%rcx)
+; SSE41-NEXT: movq %xmm0, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm6, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: saddo_v6i32:
@@ -496,28 +349,15 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -525,17 +365,10 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
; AVX2-LABEL: saddo_v6i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
@@ -544,13 +377,10 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
; AVX512-LABEL: saddo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -568,37 +398,18 @@ define <6 x i32> @saddo_v6i32(<6 x i32>
define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE-NEXT: pxor %xmm5, %xmm6
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE-NEXT: pxor %xmm5, %xmm7
-; SSE-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm5, %xmm2
-; SSE-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE-NEXT: pandn %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE-NEXT: pxor %xmm5, %xmm6
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE-NEXT: pxor %xmm5, %xmm7
-; SSE-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE-NEXT: paddd %xmm3, %xmm4
-; SSE-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pcmpeqd %xmm7, %xmm1
-; SSE-NEXT: pandn %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm4, 16(%rdi)
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm0
+; SSE-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm3, 16(%rdi)
+; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v8i32:
@@ -606,28 +417,15 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -635,30 +433,20 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
; AVX2-LABEL: saddo_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: saddo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
@@ -674,132 +462,70 @@ define <8 x i32> @saddo_v8i32(<8 x i32>
define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v16i32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pxor %xmm11, %xmm11
-; SSE-NEXT: pcmpgtd %xmm4, %xmm11
-; SSE-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE-NEXT: pxor %xmm10, %xmm11
-; SSE-NEXT: pxor %xmm12, %xmm12
-; SSE-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE-NEXT: pxor %xmm10, %xmm12
-; SSE-NEXT: pcmpeqd %xmm12, %xmm11
-; SSE-NEXT: paddd %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm8, %xmm8
; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE-NEXT: pxor %xmm10, %xmm9
-; SSE-NEXT: pcmpeqd %xmm12, %xmm9
-; SSE-NEXT: pandn %xmm11, %xmm9
-; SSE-NEXT: pxor %xmm12, %xmm12
-; SSE-NEXT: pcmpgtd %xmm5, %xmm12
-; SSE-NEXT: pxor %xmm10, %xmm12
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE-NEXT: pxor %xmm10, %xmm4
-; SSE-NEXT: pcmpeqd %xmm4, %xmm12
-; SSE-NEXT: paddd %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm11, %xmm11
-; SSE-NEXT: pcmpgtd %xmm1, %xmm11
-; SSE-NEXT: pxor %xmm10, %xmm11
-; SSE-NEXT: pcmpeqd %xmm4, %xmm11
-; SSE-NEXT: pandn %xmm12, %xmm11
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE-NEXT: pxor %xmm10, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm10, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE-NEXT: paddd %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE-NEXT: pxor %xmm10, %xmm6
-; SSE-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE-NEXT: pxor %xmm10, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE-NEXT: pxor %xmm10, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE-NEXT: paddd %xmm7, %xmm8
-; SSE-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE-NEXT: pxor %xmm10, %xmm3
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: pandn %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm8, 48(%rdi)
-; SSE-NEXT: movdqa %xmm2, 32(%rdi)
-; SSE-NEXT: movdqa %xmm1, 16(%rdi)
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: movdqa %xmm6, %xmm2
+; SSE-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE-NEXT: paddd %xmm0, %xmm4
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm9, %xmm0
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE-NEXT: paddd %xmm1, %xmm5
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm9, %xmm1
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE-NEXT: paddd %xmm2, %xmm6
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm9, %xmm2
+; SSE-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE-NEXT: paddd %xmm3, %xmm7
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm8, %xmm3
+; SSE-NEXT: movdqa %xmm7, 48(%rdi)
+; SSE-NEXT: movdqa %xmm6, 32(%rdi)
+; SSE-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE-NEXT: movdqa %xmm4, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm4, %xmm8
-; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm8, %xmm4, %xmm8
+; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm10
-; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm1
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm1
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vmovdqa %xmm9, 48(%rdi)
-; AVX1-NEXT: vmovdqa %xmm10, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
@@ -807,28 +533,15 @@ define <16 x i32> @saddo_v16i32(<16 x i3
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1
-; AVX2-NEXT: vpandn %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0
-; AVX2-NEXT: vpandn %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
@@ -842,13 +555,10 @@ define <16 x i32> @saddo_v16i32(<16 x i3
; AVX512-LABEL: saddo_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm2, %k0
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT: retq
@@ -1094,157 +804,42 @@ define <8 x i32> @saddo_v8i16(<8 x i16>
}
define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
-; SSE2-LABEL: saddo_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, (%rdi)
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: saddo_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: paddq %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, (%rdi)
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: saddo_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: saddo_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -1252,17 +847,10 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; AVX2-LABEL: saddo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -1270,13 +858,10 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; AVX512-LABEL: saddo_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
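
The updated saddo check lines above (and the vec_ssubo.ll checks that follow) all reduce to the same shape: one compare of the right-hand operand against zero, one compare of the result against the left-hand operand, and an XOR of the two masks (pcmpgtd/pxor in SSE, vpcmpgtd/vpxor in AVX, vpcmp*/kxorw in AVX512). A minimal standalone scalar C++ sketch of that predicate — hypothetical helper names, not code from this patch — looks like:

  #include <cstdint>
  #include <cstdio>

  // Hypothetical helpers; each vector lane in the checks above computes the
  // same boolean, with all-ones compare masks standing in for true/false.
  static bool saddOverflows(int32_t Lhs, int32_t Rhs) {
    // Wrap-around add, then: (Rhs < 0) xor (Sum < Lhs).
    int32_t Sum = static_cast<int32_t>(static_cast<uint32_t>(Lhs) +
                                       static_cast<uint32_t>(Rhs));
    return (Rhs < 0) != (Sum < Lhs);
  }

  static bool ssubOverflows(int32_t Lhs, int32_t Rhs) {
    // Wrap-around sub, then: (Rhs > 0) xor (Diff < Lhs).
    int32_t Diff = static_cast<int32_t>(static_cast<uint32_t>(Lhs) -
                                        static_cast<uint32_t>(Rhs));
    return (Rhs > 0) != (Diff < Lhs);
  }

  int main() {
    printf("%d %d\n", saddOverflows(INT32_MAX, 1), saddOverflows(1, 2)); // 1 0
    printf("%d %d\n", ssubOverflows(INT32_MIN, 1), ssubOverflows(3, 2)); // 1 0
    return 0;
  }
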
Modified: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ssubo.ll?rev=373187&r1=373186&r2=373187&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ssubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll Mon Sep 30 00:58:50 2019
@@ -51,71 +51,42 @@ define <1 x i32> @ssubo_v1i32(<1 x i32>
define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psubd %xmm1, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movq %xmm3, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovq %xmm1, (%rdi)
@@ -131,87 +102,49 @@ define <2 x i32> @ssubo_v2i32(<2 x i32>
define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, 8(%rdi)
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movq %xmm3, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: psubd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movd %xmm0, 8(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movq %xmm3, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pandn %xmm3, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi)
+; SSE41-NEXT: movq %xmm3, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v3i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -219,18 +152,10 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
; AVX2-LABEL: ssubo_v3i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -238,13 +163,10 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
; AVX512-LABEL: ssubo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
@@ -261,71 +183,42 @@ define <3 x i32> @ssubo_v3i32(<3 x i32>
define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psubd %xmm1, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm3, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@@ -342,201 +235,132 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
; SSE2-LABEL: ssubo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: movd %r8d, %xmm0
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd %edx, %xmm2
-; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %r8d, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: movd %esi, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
; SSE2-NEXT: movd %r9d, %xmm1
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE2-NEXT: psubd %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: movq %xmm1, 16(%rcx)
-; SSE2-NEXT: movdqa %xmm0, (%rcx)
-; SSE2-NEXT: movq %xmm6, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm2, (%rdi)
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubd %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movq %xmm3, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: movq %xmm2, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
-; SSSE3-NEXT: movd %r8d, %xmm0
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd %edx, %xmm2
-; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %r8d, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: movd %esi, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
; SSSE3-NEXT: movd %r9d, %xmm1
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
-; SSSE3-NEXT: psubd %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
-; SSSE3-NEXT: psubd %xmm4, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm6
-; SSSE3-NEXT: movq %xmm1, 16(%rcx)
-; SSSE3-NEXT: movdqa %xmm0, (%rcx)
-; SSSE3-NEXT: movq %xmm6, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, (%rdi)
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psubd %xmm0, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psubd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movq %xmm3, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm4, (%rcx)
+; SSSE3-NEXT: movq %xmm2, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: movd %esi, %xmm0
-; SSE41-NEXT: pinsrd $1, %edx, %xmm0
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
-; SSE41-NEXT: movd %r9d, %xmm1
-; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd %esi, %xmm1
+; SSE41-NEXT: pinsrd $1, %edx, %xmm1
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm1
+; SSE41-NEXT: movd %r9d, %xmm0
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: pxor %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE41-NEXT: psubd %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pandn %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pandn %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psubd %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movq %xmm1, 16(%rcx)
-; SSE41-NEXT: movdqa %xmm0, (%rcx)
-; SSE41-NEXT: movq %xmm6, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rcx)
+; SSE41-NEXT: movq %xmm0, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm3, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -544,18 +368,10 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
; AVX2-LABEL: ssubo_v6i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
@@ -564,13 +380,10 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
; AVX512-LABEL: ssubo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -588,70 +401,35 @@ define <6 x i32> @ssubo_v6i32(<6 x i32>
define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE-NEXT: pxor %xmm6, %xmm4
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE-NEXT: pxor %xmm6, %xmm7
-; SSE-NEXT: pcmpeqd %xmm7, %xmm4
-; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE-NEXT: pxor %xmm6, %xmm7
-; SSE-NEXT: pcmpeqd %xmm7, %xmm2
-; SSE-NEXT: psubd %xmm3, %xmm1
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm6, %xmm5
-; SSE-NEXT: pcmpeqd %xmm7, %xmm5
-; SSE-NEXT: pxor %xmm6, %xmm5
-; SSE-NEXT: pandn %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm1, 16(%rdi)
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: psubd %xmm2, %xmm5
+; SSE-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psubd %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm2, 16(%rdi)
+; SSE-NEXT: movdqa %xmm5, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -659,31 +437,20 @@ define <8 x i32> @ssubo_v8i32(<8 x i32>
; AVX2-LABEL: ssubo_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
@@ -699,128 +466,59 @@ define <8 x i32> @ssubo_v8i32(<8 x i32>
define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm10, %xmm10
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: pcmpgtd %xmm4, %xmm8
-; SSE-NEXT: pcmpeqd %xmm11, %xmm11
-; SSE-NEXT: pxor %xmm11, %xmm8
-; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE-NEXT: pxor %xmm11, %xmm9
-; SSE-NEXT: pcmpeqd %xmm9, %xmm8
-; SSE-NEXT: psubd %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE-NEXT: pxor %xmm11, %xmm4
-; SSE-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE-NEXT: pxor %xmm11, %xmm4
-; SSE-NEXT: pandn %xmm4, %xmm8
; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: pcmpgtd %xmm5, %xmm9
-; SSE-NEXT: pxor %xmm11, %xmm9
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE-NEXT: pxor %xmm11, %xmm4
-; SSE-NEXT: pcmpeqd %xmm4, %xmm9
-; SSE-NEXT: psubd %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm11, %xmm5
-; SSE-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE-NEXT: pxor %xmm11, %xmm5
-; SSE-NEXT: pandn %xmm5, %xmm9
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE-NEXT: pxor %xmm11, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm11, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE-NEXT: psubd %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE-NEXT: pxor %xmm11, %xmm6
-; SSE-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE-NEXT: pxor %xmm11, %xmm6
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE-NEXT: pxor %xmm11, %xmm5
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE-NEXT: pxor %xmm11, %xmm6
-; SSE-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE-NEXT: psubd %xmm7, %xmm3
-; SSE-NEXT: pcmpgtd %xmm3, %xmm10
-; SSE-NEXT: pxor %xmm11, %xmm10
-; SSE-NEXT: pcmpeqd %xmm6, %xmm10
-; SSE-NEXT: pxor %xmm11, %xmm10
-; SSE-NEXT: pandn %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm3, 48(%rdi)
-; SSE-NEXT: movdqa %xmm2, 32(%rdi)
-; SSE-NEXT: movdqa %xmm1, 16(%rdi)
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: psubd %xmm4, %xmm8
+; SSE-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psubd %xmm5, %xmm4
+; SSE-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: psubd %xmm6, %xmm5
+; SSE-NEXT: pcmpgtd %xmm9, %xmm6
+; SSE-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: psubd %xmm7, %xmm6
+; SSE-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: movdqa %xmm6, 48(%rdi)
+; SSE-NEXT: movdqa %xmm5, 32(%rdi)
+; SSE-NEXT: movdqa %xmm4, 16(%rdi)
+; SSE-NEXT: movdqa %xmm8, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm6, %xmm8
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm10
-; AVX1-NEXT: vpcmpgtd %xmm10, %xmm9, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm9, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm9, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7
+; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm9, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpandn %xmm1, %xmm7, %xmm1
-; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpandn %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm9, %xmm6
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
@@ -830,40 +528,25 @@ define <16 x i32> @ssubo_v16i32(<16 x i3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vmovdqa %xmm10, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm7, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1
-; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpandn %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
-; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm4
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0
-; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
@@ -877,13 +560,10 @@ define <16 x i32> @ssubo_v16i32(<16 x i3
; AVX512-LABEL: ssubo_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT: retq
@@ -1129,161 +809,42 @@ define <8 x i32> @ssubo_v8i16(<8 x i16>
}
define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
-; SSE2-LABEL: ssubo_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, (%rdi)
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: ssubo_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, (%rdi)
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: ssubo_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: ssubo_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: psubq %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -1291,18 +852,10 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; AVX2-LABEL: ssubo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -1310,13 +863,10 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; AVX512-LABEL: ssubo_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
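
For reference, the simpler lowering visible in the AVX512 check lines above - one compare of RHS against zero, one compare of the result against LHS, and an XOR of the two masks - follows this scalar pattern for signed-subtraction overflow. This is a minimal illustrative sketch, not code from the patch; the helper name ssubOverflows is hypothetical.

#include <cstdint>
#include <cassert>

// Signed-subtraction overflow check in the style of the new SSUBO expansion:
// when no overflow occurs, the result is below LHS exactly when RHS is
// (non-zero) positive; XOR-ing the two conditions therefore flags overflow.
bool ssubOverflows(int64_t LHS, int64_t RHS, int64_t &Result) {
  Result = (int64_t)((uint64_t)LHS - (uint64_t)RHS); // wrapping subtract
  bool ResultLowerThanLHS = Result < LHS;            // setcc Result, LHS, SETLT
  bool RHSPositive = RHS > 0;                        // setcc RHS, 0, SETGT
  return ResultLowerThanLHS != RHSPositive;          // XOR of the two setccs
}

int main() {
  int64_t R;
  assert(!ssubOverflows(5, 3, R) && R == 2);   // no overflow
  assert(ssubOverflows(INT64_MIN, 1, R));      // negative overflow
  assert(ssubOverflows(INT64_MAX, -1, R));     // positive overflow
  return 0;
}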