[llvm] r353464 - [CodeGen] Handle vector UADDO, SADDO, USUBO, SSUBO
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 7 13:02:23 PST 2019
Author: nikic
Date: Thu Feb 7 13:02:22 2019
New Revision: 353464
URL: http://llvm.org/viewvc/llvm-project?rev=353464&view=rev
Log:
[CodeGen] Handle vector UADDO, SADDO, USUBO, SSUBO
This is part of https://bugs.llvm.org/show_bug.cgi?id=40442.
Vector legalization is implemented for the add/sub overflow opcodes.
UMULO/SMULO are also handled as far as legalization is concerned, but
they don't support vector expansion yet (so no tests for them).
The vector result widening implementation is suboptimal because it may
trigger a widen/split legalization loop.
Differential Revision: https://reviews.llvm.org/D57639
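
For reference, the kind of IR that now legalizes through these paths is what
the added tests exercise. A minimal sketch (mirroring the new vec_uaddo tests;
the function and value names here are illustrative only):

declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)

define <4 x i32> @example_uaddo_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %sum.out) {
  ; Both the <4 x i32> sum and the <4 x i1> overflow mask are vector results,
  ; which the type legalizer can now scalarize, split or widen as needed.
  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
  %sum = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %ov = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  store <4 x i32> %sum, <4 x i32>* %sum.out
  %ov.ext = sext <4 x i1> %ov to <4 x i32>
  ret <4 x i32> %ov.ext
}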
Added:
llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll
llvm/trunk/test/CodeGen/X86/vec_saddo.ll
llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
llvm/trunk/test/CodeGen/X86/vec_uaddo.ll
llvm/trunk/test/CodeGen/X86/vec_usubo.ll
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll
llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll
llvm/trunk/test/CodeGen/AMDGPU/usubo.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp Thu Feb 7 13:02:22 2019
@@ -881,7 +881,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
// Calculate the overflow flag: zero extend the arithmetic result from
// the original type.
- SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+ SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT.getScalarType());
// Overflowed if and only if this is not equal to Res.
Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h Thu Feb 7 13:02:22 2019
@@ -674,6 +674,7 @@ private:
SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
SDValue ScalarizeVecRes_StrictFPOp(SDNode *N);
+ SDValue ScalarizeVecRes_OverflowOp(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_InregOp(SDNode *N);
SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
@@ -728,6 +729,8 @@ private:
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
void SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -809,6 +812,7 @@ private:
SDValue WidenVecRes_Binary(SDNode *N);
SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
SDValue WidenVecRes_StrictFP(SDNode *N);
+ SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
SDValue WidenVecRes_POWI(SDNode *N);
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Thu Feb 7 13:02:22 2019
@@ -171,6 +171,14 @@ void DAGTypeLegalizer::ScalarizeVectorRe
case ISD::STRICT_FTRUNC:
R = ScalarizeVecRes_StrictFPOp(N);
break;
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ R = ScalarizeVecRes_OverflowOp(N, ResNo);
+ break;
case ISD::SMULFIX:
case ISD::UMULFIX:
R = ScalarizeVecRes_MULFIX(N);
@@ -235,6 +243,43 @@ SDValue DAGTypeLegalizer::ScalarizeVecRe
return Result;
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
+ unsigned ResNo) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+
+ SDValue ScalarLHS, ScalarRHS;
+ if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
+ ScalarLHS = GetScalarizedVector(N->getOperand(0));
+ ScalarRHS = GetScalarizedVector(N->getOperand(1));
+ } else {
+ SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
+ DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
+ DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
+ ScalarLHS = ElemsLHS[0];
+ ScalarRHS = ElemsRHS[0];
+ }
+
+ SDVTList ScalarVTs = DAG.getVTList(
+ ResVT.getVectorElementType(), OvVT.getVectorElementType());
+ SDNode *ScalarNode = DAG.getNode(
+ N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+
+ // Replace the other vector result not being explicitly scalarized here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+ SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+ } else {
+ SDValue OtherVal = DAG.getNode(
+ ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+
+ return SDValue(ScalarNode, ResNo);
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -859,6 +904,14 @@ void DAGTypeLegalizer::SplitVectorResult
case ISD::STRICT_FTRUNC:
SplitVecRes_StrictFPOp(N, Lo, Hi);
break;
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
+ break;
case ISD::SMULFIX:
case ISD::UMULFIX:
SplitVecRes_MULFIX(N, Lo, Hi);
@@ -1205,6 +1258,47 @@ void DAGTypeLegalizer::SplitVecRes_Stric
ReplaceValueWith(SDValue(N, 1), Chain);
}
+void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+ EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
+ std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
+ std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
+
+ SDValue LoLHS, HiLHS, LoRHS, HiRHS;
+ if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+ GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
+ GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
+ } else {
+ std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
+ std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
+ }
+
+ unsigned Opcode = N->getOpcode();
+ SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
+ SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
+ SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
+ SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+
+ Lo = SDValue(LoNode, ResNo);
+ Hi = SDValue(HiNode, ResNo);
+
+ // Replace the other vector result not being explicitly split here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+ SetSplitVector(SDValue(N, OtherNo),
+ SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+ } else {
+ SDValue OtherVal = DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, OtherVT,
+ SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+}
+
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
@@ -2471,6 +2565,15 @@ void DAGTypeLegalizer::WidenVectorResult
Res = WidenVecRes_StrictFP(N);
break;
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ Res = WidenVecRes_OverflowOp(N, ResNo);
+ break;
+
case ISD::FCOPYSIGN:
Res = WidenVecRes_FCOPYSIGN(N);
break;
@@ -2845,6 +2948,58 @@ SDValue DAGTypeLegalizer::WidenVecRes_St
return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
}
+SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+ EVT WideResVT, WideOvVT;
+ SDValue WideLHS, WideRHS;
+
+ // TODO: This might result in a widen/split loop.
+ if (ResNo == 0) {
+ WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
+ WideOvVT = EVT::getVectorVT(
+ *DAG.getContext(), OvVT.getVectorElementType(),
+ WideResVT.getVectorNumElements());
+
+ WideLHS = GetWidenedVector(N->getOperand(0));
+ WideRHS = GetWidenedVector(N->getOperand(1));
+ } else {
+ WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
+ WideResVT = EVT::getVectorVT(
+ *DAG.getContext(), ResVT.getVectorElementType(),
+ WideOvVT.getVectorNumElements());
+
+ SDValue Zero = DAG.getConstant(
+ 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+ WideLHS = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+ N->getOperand(0), Zero);
+ WideRHS = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+ N->getOperand(1), Zero);
+ }
+
+ SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
+ SDNode *WideNode = DAG.getNode(
+ N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
+
+ // Replace the other vector result not being explicitly widened here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
+ SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
+ } else {
+ SDValue Zero = DAG.getConstant(
+ 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue OtherVal = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+
+ return SDValue(WideNode, ResNo);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Thu Feb 7 13:02:22 2019
@@ -6113,7 +6113,13 @@ SelectionDAGBuilder::visitIntrinsicCall(
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
- SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+ EVT ResultVT = Op1.getValueType();
+ EVT OverflowVT = MVT::i1;
+ if (ResultVT.isVector())
+ OverflowVT = EVT::getVectorVT(
+ *Context, OverflowVT, ResultVT.getVectorNumElements());
+
+ SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
return nullptr;
}
Added: llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll?rev=353464&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll Thu Feb 7 13:02:22 2019
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+
+declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str s1, [x0]
+; CHECK-NEXT: ret
+ %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+ %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+ %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+ %res = sext <1 x i1> %obit to <1 x i32>
+ store <1 x i32> %val, <1 x i32>* %p2
+ ret <1 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: ret
+ %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+ %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i32> %val, <2 x i32>* %p2
+ ret <2 x i32> %res
+}
+
+define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: add x8, x0, #8 // =8
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: st1 { v1.s }[2], [x8]
+; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: ret
+ %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+ %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+ %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+ %res = sext <3 x i1> %obit to <3 x i32>
+ store <3 x i32> %val, <3 x i32>* %p2
+ ret <3 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+ %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+ %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i32> %val, <4 x i32>* %p2
+ ret <4 x i32> %res
+}
+
+define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w6
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov v0.s[1], w7
+; CHECK-NEXT: ldr s2, [sp, #16]
+; CHECK-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-NEXT: add x9, sp, #8 // =8
+; CHECK-NEXT: add x10, sp, #24 // =24
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: ld1 { v2.s }[1], [x10]
+; CHECK-NEXT: ld1 { v0.s }[3], [x9]
+; CHECK-NEXT: mov v1.s[1], w1
+; CHECK-NEXT: fmov s3, w4
+; CHECK-NEXT: ldr x11, [sp, #32]
+; CHECK-NEXT: mov v1.s[2], w2
+; CHECK-NEXT: mov v3.s[1], w5
+; CHECK-NEXT: mov v1.s[3], w3
+; CHECK-NEXT: add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s
+; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: str d2, [x11, #16]
+; CHECK-NEXT: xtn v2.4h, v3.4s
+; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: mov w5, v2.s[1]
+; CHECK-NEXT: mov w1, v1.s[1]
+; CHECK-NEXT: mov w2, v1.s[2]
+; CHECK-NEXT: mov w3, v1.s[3]
+; CHECK-NEXT: fmov w4, s2
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: str q0, [x11]
+; CHECK-NEXT: ret
+ %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+ %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+ %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+ %res = sext <6 x i1> %obit to <6 x i32>
+ store <6 x i32> %val, <6 x i32>* %p2
+ ret <6 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: cmhi v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: stp q2, q3, [x0]
+; CHECK-NEXT: ret
+ %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+ %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %p2
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v4.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-NEXT: sshr v0.4s, v1.4s, #31
+; CHECK-NEXT: sshr v1.4s, v2.4s, #31
+; CHECK-NEXT: shl v2.4s, v3.4s, #31
+; CHECK-NEXT: shl v3.4s, v5.4s, #31
+; CHECK-NEXT: sshr v2.4s, v2.4s, #31
+; CHECK-NEXT: sshr v3.4s, v3.4s, #31
+; CHECK-NEXT: str q4, [x0]
+; CHECK-NEXT: ret
+ %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+ %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i8> %val, <16 x i8>* %p2
+ ret <16 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
+; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: shl v3.4s, v0.4s, #31
+; CHECK-NEXT: sshr v0.4s, v1.4s, #31
+; CHECK-NEXT: sshr v1.4s, v3.4s, #31
+; CHECK-NEXT: str q2, [x0]
+; CHECK-NEXT: ret
+ %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+ %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i16> %val, <8 x i16>* %p2
+ ret <8 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+ %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i64> %val, <2 x i64>* %p2
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i24:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v1.4s, #255, lsl #24
+; CHECK-NEXT: bic v0.4s, #255, lsl #24
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: bic v1.4s, #255, lsl #24
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w10, v0.s[1]
+; CHECK-NEXT: sturh w8, [x0, #9]
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: strh w9, [x0, #6]
+; CHECK-NEXT: sturh w10, [x0, #3]
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: lsr w10, w10, #16
+; CHECK-NEXT: strb w8, [x0, #11]
+; CHECK-NEXT: mvn v0.16b, v1.16b
+; CHECK-NEXT: lsr w8, w11, #16
+; CHECK-NEXT: strh w11, [x0]
+; CHECK-NEXT: strb w9, [x0, #8]
+; CHECK-NEXT: strb w10, [x0, #5]
+; CHECK-NEXT: strb w8, [x0, #2]
+; CHECK-NEXT: ret
+ %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+ %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i24> %val, <4 x i24>* %p2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4h, #1
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: add v1.4h, v0.4h, v1.4h
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: bfi w8, w9, #1, #1
+; CHECK-NEXT: umov w9, v1.h[2]
+; CHECK-NEXT: and v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: bfi w8, w9, #2, #1
+; CHECK-NEXT: umov w9, v1.h[3]
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: bfi w8, w9, #3, #29
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+ %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+ %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i1> %val, <4 x i1>* %p2
+ ret <4 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adds x9, x2, x6
+; CHECK-NEXT: adcs x10, x3, x7
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: cset w11, lo
+; CHECK-NEXT: cmp x10, x3
+; CHECK-NEXT: cset w12, lo
+; CHECK-NEXT: csel w11, w11, w12, eq
+; CHECK-NEXT: adds x12, x0, x4
+; CHECK-NEXT: adcs x13, x1, x5
+; CHECK-NEXT: cmp x12, x0
+; CHECK-NEXT: cset w14, lo
+; CHECK-NEXT: cmp x13, x1
+; CHECK-NEXT: cset w15, lo
+; CHECK-NEXT: csel w14, w14, w15, eq
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: fmov s0, w14
+; CHECK-NEXT: mov v0.s[1], w11
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: sshr v0.2s, v0.2s, #31
+; CHECK-NEXT: stp x9, x10, [x8, #16]
+; CHECK-NEXT: stp x12, x13, [x8]
+; CHECK-NEXT: ret
+ %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+ %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i128> %val, <2 x i128>* %p2
+ ret <2 x i32> %res
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/saddo.ll?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/saddo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/saddo.ll Thu Feb 7 13:02:22 2019
@@ -1,11 +1,14 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+
+declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
; FUNC-LABEL: {{^}}saddo_i64_zext:
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -65,3 +68,22 @@ define amdgpu_kernel void @v_saddo_i64(i
store i1 %carry, i1 addrspace(1)* %carryout
ret void
}
+
+; FUNC-LABEL: {{^}}v_saddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+ %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+ %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+ %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+ store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+ %carry.ext = zext <2 x i1> %carry to <2 x i32>
+ store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll Thu Feb 7 13:02:22 2019
@@ -1,10 +1,11 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
; FUNC-LABEL: {{^}}ssubo_i64_zext:
define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
@@ -70,3 +71,22 @@ define amdgpu_kernel void @v_ssubo_i64(i
store i1 %carry, i1 addrspace(1)* %carryout
ret void
}
+
+; FUNC-LABEL: {{^}}v_ssubo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+ %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+ %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+ %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+ store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+ %carry.ext = zext <2 x i1> %carry to <2 x i32>
+ store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll Thu Feb 7 13:02:22 2019
@@ -1,7 +1,6 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
; GCN: s_add_u32
@@ -152,10 +151,32 @@ define amdgpu_kernel void @v_uaddo_i16(i
ret void
}
+; FUNC-LABEL: {{^}}v_uaddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+ %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+ %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+ %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+ store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+ %carry.ext = zext <2 x i1> %carry to <2 x i32>
+ store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/usubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/usubo.ll?rev=353464&r1=353463&r2=353464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/usubo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/usubo.ll Thu Feb 7 13:02:22 2019
@@ -1,7 +1,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+
; FUNC-LABEL: {{^}}s_usubo_i64_zext:
; GCN: s_sub_u32
@@ -159,10 +159,28 @@ define amdgpu_kernel void @v_usubo_i16(i
ret void
}
+; FUNC-LABEL: {{^}}v_usubo_v2i32:
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+ %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+ %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+ %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+ store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+ %carry.ext = zext <2 x i1> %carry to <2 x i32>
+ store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Added: llvm/trunk/test/CodeGen/X86/vec_saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_saddo.ll?rev=353464&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_saddo.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll Thu Feb 7 13:02:22 2019
@@ -0,0 +1,2028 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v1i32:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: addl %esi, %edi
+; SSE-NEXT: seto %al
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movl %edi, (%rdx)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: saddo_v1i32:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: addl %esi, %edi
+; AVX-NEXT: seto %al
+; AVX-NEXT: negl %eax
+; AVX-NEXT: movl %edi, (%rdx)
+; AVX-NEXT: retq
+ %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+ %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+ %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+ %res = sext <1 x i1> %obit to <1 x i32>
+ store <1 x i32> %val, <1 x i32>* %p2
+ ret <1 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: paddq %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: paddq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+ %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i32> %val, <2 x i32>* %p2
+ ret <2 x i32> %res
+}
+
+define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v3i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v3i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v3i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: pandn %xmm3, %xmm2
+; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v3i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v3i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v3i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+ %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+ %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+ %res = sext <3 x i1> %obit to <3 x i32>
+ store <3 x i32> %val, <3 x i32>* %p2
+ ret <3 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: saddo_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+ %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i32> %val, <4 x i32>* %p2
+ ret <4 x i32> %res
+}
+
+define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v6i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movd %r8d, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: movd %esi, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %r9d, %xmm3
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: movq %xmm1, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movq %xmm2, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v6i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movd %r8d, %xmm0
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: movd %esi, %xmm4
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %r9d, %xmm3
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT: paddd %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm4
+; SSSE3-NEXT: pandn %xmm6, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
+; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT: paddd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: movq %xmm1, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm0, (%rcx)
+; SSSE3-NEXT: movq %xmm2, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v6i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: movd %esi, %xmm4
+; SSE41-NEXT: pinsrd $1, %edx, %xmm4
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm4
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm4
+; SSE41-NEXT: movd %r9d, %xmm2
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: paddd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pandn %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT: pandn %xmm6, %xmm3
+; SSE41-NEXT: movq %xmm0, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm1, (%rcx)
+; SSE41-NEXT: movq %xmm3, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v6i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v6i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v6i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+ %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+ %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+ %res = sext <6 x i1> %obit to <6 x i32>
+ store <6 x i32> %val, <6 x i32>* %p2
+ ret <6 x i32> %res
+}
+
+define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v8i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE-NEXT: pxor %xmm5, %xmm6
+; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE-NEXT: pxor %xmm5, %xmm7
+; SSE-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm5, %xmm2
+; SSE-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE-NEXT: pxor %xmm5, %xmm6
+; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE-NEXT: pxor %xmm5, %xmm7
+; SSE-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE-NEXT: paddd %xmm3, %xmm4
+; SSE-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pcmpeqd %xmm7, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: movdqa %xmm4, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: saddo_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+ %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %p2
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v16i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pxor %xmm11, %xmm11
+; SSE-NEXT: pcmpgtd %xmm4, %xmm11
+; SSE-NEXT: pcmpeqd %xmm10, %xmm10
+; SSE-NEXT: pxor %xmm10, %xmm11
+; SSE-NEXT: pxor %xmm12, %xmm12
+; SSE-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE-NEXT: pxor %xmm10, %xmm12
+; SSE-NEXT: pcmpeqd %xmm12, %xmm11
+; SSE-NEXT: paddd %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE-NEXT: pxor %xmm10, %xmm9
+; SSE-NEXT: pcmpeqd %xmm12, %xmm9
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: pxor %xmm12, %xmm12
+; SSE-NEXT: pcmpgtd %xmm5, %xmm12
+; SSE-NEXT: pxor %xmm10, %xmm12
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm10, %xmm4
+; SSE-NEXT: pcmpeqd %xmm4, %xmm12
+; SSE-NEXT: paddd %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm11, %xmm11
+; SSE-NEXT: pcmpgtd %xmm1, %xmm11
+; SSE-NEXT: pxor %xmm10, %xmm11
+; SSE-NEXT: pcmpeqd %xmm4, %xmm11
+; SSE-NEXT: pandn %xmm12, %xmm11
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE-NEXT: pxor %xmm10, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE-NEXT: pxor %xmm10, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE-NEXT: paddd %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE-NEXT: pxor %xmm10, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE-NEXT: pandn %xmm4, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE-NEXT: pxor %xmm10, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE-NEXT: pxor %xmm10, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE-NEXT: paddd %xmm7, %xmm8
+; SSE-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE-NEXT: pxor %xmm10, %xmm3
+; SSE-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm8, 48(%rdi)
+; SSE-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: movdqa %xmm6, %xmm2
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: saddo_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm10
+; AVX1-NEXT: vpcmpeqd %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm11, %xmm6, %xmm11
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm10, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm10
+; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm11
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm12
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm4
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm8, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm3, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7
+; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpandn %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
+; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
+; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0
+; AVX2-NEXT: vpandn %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+ %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i32> %val, <16 x i32>* %p2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: saddo_v16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqb %xmm5, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpgtb %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqb %xmm5, %xmm2
+; SSSE3-NEXT: paddb %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtb %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqb %xmm5, %xmm2
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtb %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa %xmm6, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm6
+; AVX2-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %xmm6, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+ %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i8> %val, <16 x i8>* %p2
+ ret <16 x i32> %res
+}
+
+define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: saddo_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtw %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqw %xmm5, %xmm3
+; SSSE3-NEXT: paddw %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpgtw %xmm2, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtw %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtw %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE41-NEXT: pandn %xmm3, %xmm1
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+ %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i16> %val, <8 x i16>* %p2
+ ret <8 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm1, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm0
+; SSE41-NEXT: pandn %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i64> %val, <2 x i64>* %p2
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: saddo_v4i24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslld $8, %xmm1
+; SSE2-NEXT: psrad $8, %xmm1
+; SSE2-NEXT: pslld $8, %xmm2
+; SSE2-NEXT: psrad $8, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pslld $8, %xmm0
+; SSE2-NEXT: psrad $8, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 9(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %edx
+; SSE2-NEXT: movw %dx, 6(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT: movd %xmm1, %esi
+; SSE2-NEXT: movw %si, 3(%rdi)
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: shrl $16, %ecx
+; SSE2-NEXT: movb %cl, 11(%rdi)
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 8(%rdi)
+; SSE2-NEXT: shrl $16, %esi
+; SSE2-NEXT: movb %sil, 5(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v4i24:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pslld $8, %xmm1
+; SSSE3-NEXT: psrad $8, %xmm1
+; SSSE3-NEXT: pslld $8, %xmm2
+; SSSE3-NEXT: psrad $8, %xmm2
+; SSSE3-NEXT: paddd %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pslld $8, %xmm0
+; SSSE3-NEXT: psrad $8, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movd %xmm2, %eax
+; SSSE3-NEXT: movw %ax, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %ecx
+; SSSE3-NEXT: movw %cx, 9(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %edx
+; SSSE3-NEXT: movw %dx, 6(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
+; SSSE3-NEXT: movw %si, 3(%rdi)
+; SSSE3-NEXT: shrl $16, %eax
+; SSSE3-NEXT: movb %al, 2(%rdi)
+; SSSE3-NEXT: shrl $16, %ecx
+; SSSE3-NEXT: movb %cl, 11(%rdi)
+; SSSE3-NEXT: shrl $16, %edx
+; SSSE3-NEXT: movb %dl, 8(%rdi)
+; SSSE3-NEXT: shrl $16, %esi
+; SSSE3-NEXT: movb %sil, 5(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v4i24:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $8, %xmm1
+; SSE41-NEXT: psrad $8, %xmm1
+; SSE41-NEXT: pslld $8, %xmm2
+; SSE41-NEXT: psrad $8, %xmm2
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pslld $8, %xmm0
+; SSE41-NEXT: psrad $8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pextrd $3, %xmm2, %eax
+; SSE41-NEXT: movw %ax, 9(%rdi)
+; SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; SSE41-NEXT: movw %cx, 6(%rdi)
+; SSE41-NEXT: pextrd $1, %xmm2, %edx
+; SSE41-NEXT: movw %dx, 3(%rdi)
+; SSE41-NEXT: movd %xmm2, %esi
+; SSE41-NEXT: movw %si, (%rdi)
+; SSE41-NEXT: shrl $16, %eax
+; SSE41-NEXT: movb %al, 11(%rdi)
+; SSE41-NEXT: shrl $16, %ecx
+; SSE41-NEXT: movb %cl, 8(%rdi)
+; SSE41-NEXT: shrl $16, %edx
+; SSE41-NEXT: movb %dl, 5(%rdi)
+; SSE41-NEXT: shrl $16, %esi
+; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v4i24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: movw %ax, 9(%rdi)
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: movw %cx, 6(%rdi)
+; AVX1-NEXT: vpextrd $1, %xmm1, %edx
+; AVX1-NEXT: movw %dx, 3(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %esi
+; AVX1-NEXT: movw %si, (%rdi)
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: movb %al, 11(%rdi)
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movb %cl, 8(%rdi)
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: movb %dl, 5(%rdi)
+; AVX1-NEXT: shrl $16, %esi
+; AVX1-NEXT: movb %sil, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v4i24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX2-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm1, %eax
+; AVX2-NEXT: movw %ax, 9(%rdi)
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: movw %cx, 6(%rdi)
+; AVX2-NEXT: vpextrd $1, %xmm1, %edx
+; AVX2-NEXT: movw %dx, 3(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movw %si, (%rdi)
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: movb %al, 11(%rdi)
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movb %cl, 8(%rdi)
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: movb %dl, 5(%rdi)
+; AVX2-NEXT: shrl $16, %esi
+; AVX2-NEXT: movb %sil, 2(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v4i24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpextrd $3, %xmm1, %eax
+; AVX512-NEXT: movw %ax, 9(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT: movw %cx, 6(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm1, %edx
+; AVX512-NEXT: movw %dx, 3(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %esi
+; AVX512-NEXT: movw %si, (%rdi)
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: movb %al, 11(%rdi)
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movb %cl, 8(%rdi)
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: movb %dl, 5(%rdi)
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movb %sil, 2(%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+ %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i24> %val, <4 x i24>* %p2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: saddo_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $31, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: movb %al, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: saddo_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k3
+; AVX512-NEXT: kxorw %k2, %k0, %k0
+; AVX512-NEXT: kxorw %k0, %k1, %k1
+; AVX512-NEXT: kandnw %k3, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+ %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i1> %val, <4 x i1>* %p2
+ ret <4 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i128:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: adcq %r11, %rax
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: sete %cl
+; SSE2-NEXT: andb %bpl, %cl
+; SSE2-NEXT: movzbl %cl, %ebp
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: sete %r11b
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %r9, %rsi
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: andb %r11b, %cl
+; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rax, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v2i128:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pushq %rbp
+; SSSE3-NEXT: pushq %rbx
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: adcq %r11, %rax
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rcx, %rcx
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: testq %r11, %r11
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: sete %cl
+; SSSE3-NEXT: andb %bpl, %cl
+; SSSE3-NEXT: movzbl %cl, %ebp
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: sete %r11b
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %r9, %rsi
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: andb %r11b, %cl
+; SSSE3-NEXT: movzbl %cl, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rax, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: popq %rbx
+; SSSE3-NEXT: popq %rbp
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v2i128:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: adcq %r11, %rax
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rcx, %rcx
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: testq %r11, %r11
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: sete %cl
+; SSE41-NEXT: andb %bpl, %cl
+; SSE41-NEXT: movzbl %cl, %ebp
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: sete %r11b
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %r9, %rsi
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: andb %r11b, %cl
+; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rax, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: saddo_v2i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: adcq %r11, %rax
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rcx, %rcx
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %bpl
+; AVX1-NEXT: testq %r11, %r11
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: sete %cl
+; AVX1-NEXT: andb %bpl, %cl
+; AVX1-NEXT: movzbl %cl, %ebp
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: sete %r11b
+; AVX1-NEXT: addq %r8, %rdi
+; AVX1-NEXT: adcq %r9, %rsi
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: andb %r11b, %cl
+; AVX1-NEXT: movzbl %cl, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rax, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: saddo_v2i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: adcq %r11, %rax
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %bpl
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: sete %cl
+; AVX2-NEXT: andb %bpl, %cl
+; AVX2-NEXT: movzbl %cl, %ebp
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: sete %r11b
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: adcq %r9, %rsi
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: andb %r11b, %cl
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rax, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: saddo_v2i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq %rcx, %r14
+; AVX512-NEXT: adcq %r11, %r14
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: testq %r11, %r11
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: cmpb %al, %cl
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: andb %bl, %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: testq %r9, %r9
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: testq %rsi, %rsi
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %al, %cl
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq %r8, %rdi
+; AVX512-NEXT: adcq %r9, %rsi
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %cl
+; AVX512-NEXT: andb %al, %cl
+; AVX512-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT: movq %rdx, 16(%r10)
+; AVX512-NEXT: movq %rdi, (%r10)
+; AVX512-NEXT: movq %r14, 24(%r10)
+; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+ %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i128> %val, <2 x i128>* %p2
+ ret <2 x i32> %res
+}
Added: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ssubo.ll?rev=353464&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ssubo.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll Thu Feb 7 13:02:22 2019
@@ -0,0 +1,2078 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v1i32:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: subl %esi, %edi
+; SSE-NEXT: seto %al
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movl %edi, (%rdx)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ssubo_v1i32:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: subl %esi, %edi
+; AVX-NEXT: seto %al
+; AVX-NEXT: negl %eax
+; AVX-NEXT: movl %edi, (%rdx)
+; AVX-NEXT: retq
+ %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+ %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+ %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+ %res = sext <1 x i1> %obit to <1 x i32>
+ store <1 x i32> %val, <1 x i32>* %p2
+ ret <1 x i32> %res
+}
+
+define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: psubq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: psubq %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: psubq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+ %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i32> %val, <2 x i32>* %p2
+ ret <2 x i32> %res
+}
+
+define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v3i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v3i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v3i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm3, %xmm2
+; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v3i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v3i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v3i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+ %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+ %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+ %res = sext <3 x i1> %obit to <3 x i32>
+ store <3 x i32> %val, <3 x i32>* %p2
+ ret <3 x i32> %res
+}
+
+define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+ %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i32> %val, <4 x i32>* %p2
+ ret <4 x i32> %res
+}
+
+define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v6i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movd %r8d, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
+; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT: psubd %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: movq %xmm1, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movq %xmm6, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm2, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v6i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movd %r8d, %xmm0
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd %edx, %xmm2
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
+; SSSE3-NEXT: movd %r9d, %xmm1
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2
+; SSSE3-NEXT: psubd %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT: psubd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: movq %xmm1, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm0, (%rcx)
+; SSSE3-NEXT: movq %xmm6, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v6i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT: movd %r9d, %xmm1
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm7, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE41-NEXT: psubd %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pandn %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: psubd %xmm3, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: pandn %xmm4, %xmm6
+; SSE41-NEXT: movq %xmm1, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm0, (%rcx)
+; SSE41-NEXT: movq %xmm6, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v6i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm8
+; AVX1-NEXT: vpsubd %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovq %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v6i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v6i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+ %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+ %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+ %res = sext <6 x i1> %obit to <6 x i32>
+ store <6 x i32> %val, <6 x i32>* %p2
+ ret <6 x i32> %res
+}
+
+define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v8i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE-NEXT: pxor %xmm6, %xmm4
+; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE-NEXT: pxor %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE-NEXT: pxor %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE-NEXT: pxor %xmm6, %xmm5
+; SSE-NEXT: pcmpeqd %xmm7, %xmm5
+; SSE-NEXT: pxor %xmm6, %xmm5
+; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm8
+; AVX1-NEXT: vpsubd %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+ %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %p2
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v16i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm10, %xmm10
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE-NEXT: pcmpeqd %xmm11, %xmm11
+; SSE-NEXT: pxor %xmm11, %xmm8
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE-NEXT: pxor %xmm11, %xmm9
+; SSE-NEXT: pcmpeqd %xmm9, %xmm8
+; SSE-NEXT: psubd %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE-NEXT: pxor %xmm11, %xmm4
+; SSE-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE-NEXT: pxor %xmm11, %xmm4
+; SSE-NEXT: pandn %xmm4, %xmm8
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE-NEXT: pxor %xmm11, %xmm9
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm11, %xmm4
+; SSE-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE-NEXT: psubd %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE-NEXT: pxor %xmm11, %xmm5
+; SSE-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE-NEXT: pxor %xmm11, %xmm5
+; SSE-NEXT: pandn %xmm5, %xmm9
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE-NEXT: pxor %xmm11, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE-NEXT: pxor %xmm11, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE-NEXT: psubd %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE-NEXT: pxor %xmm11, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE-NEXT: pxor %xmm11, %xmm6
+; SSE-NEXT: pandn %xmm6, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE-NEXT: pxor %xmm11, %xmm5
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE-NEXT: pxor %xmm11, %xmm6
+; SSE-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE-NEXT: psubd %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE-NEXT: pxor %xmm11, %xmm10
+; SSE-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE-NEXT: pxor %xmm11, %xmm10
+; SSE-NEXT: pandn %xmm10, %xmm5
+; SSE-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm5, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
+; AVX1-NEXT: vpcmpgtd %xmm12, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm10
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm10, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm11, %xmm7, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9
+; AVX1-NEXT: vpsubd %xmm8, %xmm12, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm10
+; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm13
+; AVX1-NEXT: vpcmpgtd %xmm13, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm11
+; AVX1-NEXT: vpsubd %xmm13, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm4
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm9, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm3, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5
+; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7
+; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpandn %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5
+; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7
+; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0
+; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+ %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i32> %val, <16 x i32>* %p2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpeqb %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pcmpgtb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3
+; SSSE3-NEXT: psubb %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pandn %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtb %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pandn %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa %xmm6, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm6
+; AVX2-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %xmm6, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+ %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i8> %val, <16 x i8>* %p2
+ ret <16 x i32> %res
+}
+
+define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtw %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
+; SSSE3-NEXT: psubw %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpeqw %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtw %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE41-NEXT: psubw %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtw %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm3, %xmm1
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+ %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i16> %val, <8 x i16>* %p2
+ ret <8 x i32> %res
+}
+
+define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm6, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pandn %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i64> %val, <2 x i64>* %p2
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v4i24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslld $8, %xmm1
+; SSE2-NEXT: psrad $8, %xmm1
+; SSE2-NEXT: pslld $8, %xmm2
+; SSE2-NEXT: psrad $8, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pslld $8, %xmm0
+; SSE2-NEXT: psrad $8, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 9(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %edx
+; SSE2-NEXT: movw %dx, 6(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT: movd %xmm1, %esi
+; SSE2-NEXT: movw %si, 3(%rdi)
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: shrl $16, %ecx
+; SSE2-NEXT: movb %cl, 11(%rdi)
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 8(%rdi)
+; SSE2-NEXT: shrl $16, %esi
+; SSE2-NEXT: movb %sil, 5(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v4i24:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pslld $8, %xmm1
+; SSSE3-NEXT: psrad $8, %xmm1
+; SSSE3-NEXT: pslld $8, %xmm2
+; SSSE3-NEXT: psrad $8, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pslld $8, %xmm0
+; SSSE3-NEXT: psrad $8, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movd %xmm2, %eax
+; SSSE3-NEXT: movw %ax, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %ecx
+; SSSE3-NEXT: movw %cx, 9(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %edx
+; SSSE3-NEXT: movw %dx, 6(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
+; SSSE3-NEXT: movw %si, 3(%rdi)
+; SSSE3-NEXT: shrl $16, %eax
+; SSSE3-NEXT: movb %al, 2(%rdi)
+; SSSE3-NEXT: shrl $16, %ecx
+; SSSE3-NEXT: movb %cl, 11(%rdi)
+; SSSE3-NEXT: shrl $16, %edx
+; SSSE3-NEXT: movb %dl, 8(%rdi)
+; SSSE3-NEXT: shrl $16, %esi
+; SSSE3-NEXT: movb %sil, 5(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v4i24:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $8, %xmm1
+; SSE41-NEXT: psrad $8, %xmm1
+; SSE41-NEXT: pslld $8, %xmm2
+; SSE41-NEXT: psrad $8, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pslld $8, %xmm0
+; SSE41-NEXT: psrad $8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pextrd $3, %xmm2, %eax
+; SSE41-NEXT: movw %ax, 9(%rdi)
+; SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; SSE41-NEXT: movw %cx, 6(%rdi)
+; SSE41-NEXT: pextrd $1, %xmm2, %edx
+; SSE41-NEXT: movw %dx, 3(%rdi)
+; SSE41-NEXT: movd %xmm2, %esi
+; SSE41-NEXT: movw %si, (%rdi)
+; SSE41-NEXT: shrl $16, %eax
+; SSE41-NEXT: movb %al, 11(%rdi)
+; SSE41-NEXT: shrl $16, %ecx
+; SSE41-NEXT: movb %cl, 8(%rdi)
+; SSE41-NEXT: shrl $16, %edx
+; SSE41-NEXT: movb %dl, 5(%rdi)
+; SSE41-NEXT: shrl $16, %esi
+; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v4i24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: movw %ax, 9(%rdi)
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: movw %cx, 6(%rdi)
+; AVX1-NEXT: vpextrd $1, %xmm1, %edx
+; AVX1-NEXT: movw %dx, 3(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %esi
+; AVX1-NEXT: movw %si, (%rdi)
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: movb %al, 11(%rdi)
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movb %cl, 8(%rdi)
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: movb %dl, 5(%rdi)
+; AVX1-NEXT: shrl $16, %esi
+; AVX1-NEXT: movb %sil, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v4i24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX2-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm1, %eax
+; AVX2-NEXT: movw %ax, 9(%rdi)
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: movw %cx, 6(%rdi)
+; AVX2-NEXT: vpextrd $1, %xmm1, %edx
+; AVX2-NEXT: movw %dx, 3(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movw %si, (%rdi)
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: movb %al, 11(%rdi)
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movb %cl, 8(%rdi)
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: movb %dl, 5(%rdi)
+; AVX2-NEXT: shrl $16, %esi
+; AVX2-NEXT: movb %sil, 2(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v4i24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1
+; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $8, %xmm1, %xmm0
+; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpextrd $3, %xmm1, %eax
+; AVX512-NEXT: movw %ax, 9(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT: movw %cx, 6(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm1, %edx
+; AVX512-NEXT: movw %dx, 3(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %esi
+; AVX512-NEXT: movw %si, (%rdi)
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: movb %al, 11(%rdi)
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movb %cl, 8(%rdi)
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: movb %dl, 5(%rdi)
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movb %sil, 2(%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+ %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i24> %val, <4 x i24>* %p2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: ssubo_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $31, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: movb %al, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k3
+; AVX512-NEXT: kxorw %k2, %k0, %k0
+; AVX512-NEXT: kxnorw %k0, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k3, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+ %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i1> %val, <4 x i1>* %p2
+ ret <4 x i32> %res
+}
+
+define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i128:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: sbbq %r11, %rax
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: andb %bpl, %cl
+; SSE2-NEXT: movzbl %cl, %ebp
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %r11b
+; SSE2-NEXT: subq %r8, %rdi
+; SSE2-NEXT: sbbq %r9, %rsi
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: andb %r11b, %cl
+; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rax, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v2i128:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pushq %rbp
+; SSSE3-NEXT: pushq %rbx
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: sbbq %r11, %rax
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rcx, %rcx
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: testq %r11, %r11
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: andb %bpl, %cl
+; SSSE3-NEXT: movzbl %cl, %ebp
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %r11b
+; SSSE3-NEXT: subq %r8, %rdi
+; SSSE3-NEXT: sbbq %r9, %rsi
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: andb %r11b, %cl
+; SSSE3-NEXT: movzbl %cl, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rax, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: popq %rbx
+; SSSE3-NEXT: popq %rbp
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v2i128:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: sbbq %r11, %rax
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rcx, %rcx
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: testq %r11, %r11
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: andb %bpl, %cl
+; SSE41-NEXT: movzbl %cl, %ebp
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %r11b
+; SSE41-NEXT: subq %r8, %rdi
+; SSE41-NEXT: sbbq %r9, %rsi
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: andb %r11b, %cl
+; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rax, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: ssubo_v2i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: sbbq %r11, %rax
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rcx, %rcx
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %bpl
+; AVX1-NEXT: testq %r11, %r11
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: andb %bpl, %cl
+; AVX1-NEXT: movzbl %cl, %ebp
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %r11b
+; AVX1-NEXT: subq %r8, %rdi
+; AVX1-NEXT: sbbq %r9, %rsi
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: andb %r11b, %cl
+; AVX1-NEXT: movzbl %cl, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rax, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ssubo_v2i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: sbbq %r11, %rax
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %bpl
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: andb %bpl, %cl
+; AVX2-NEXT: movzbl %cl, %ebp
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %r11b
+; AVX2-NEXT: subq %r8, %rdi
+; AVX2-NEXT: sbbq %r9, %rsi
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: andb %r11b, %cl
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rax, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ssubo_v2i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq %rcx, %r14
+; AVX512-NEXT: sbbq %r11, %r14
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: testq %r11, %r11
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: cmpb %al, %cl
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: andb %bl, %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: testq %r9, %r9
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: testq %rsi, %rsi
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %al, %cl
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: subq %r8, %rdi
+; AVX512-NEXT: sbbq %r9, %rsi
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %cl
+; AVX512-NEXT: andb %al, %cl
+; AVX512-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT: movq %rdx, 16(%r10)
+; AVX512-NEXT: movq %rdi, (%r10)
+; AVX512-NEXT: movq %r14, 24(%r10)
+; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+ %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i128> %val, <2 x i128>* %p2
+ ret <2 x i32> %res
+}
Added: llvm/trunk/test/CodeGen/X86/vec_uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_uaddo.ll?rev=353464&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_uaddo.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_uaddo.ll Thu Feb 7 13:02:22 2019
@@ -0,0 +1,1381 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: uaddo_v1i32:
+; SSE: # %bb.0:
+; SSE-NEXT: addl %esi, %edi
+; SSE-NEXT: sbbl %eax, %eax
+; SSE-NEXT: movl %edi, (%rdx)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uaddo_v1i32:
+; AVX: # %bb.0:
+; AVX-NEXT: addl %esi, %edi
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: movl %edi, (%rdx)
+; AVX-NEXT: retq
+ %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+ %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+ %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+ %res = sext <1 x i1> %obit to <1 x i32>
+ store <1 x i32> %val, <1 x i32>* %p2
+ ret <1 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: paddq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+ %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i32> %val, <2 x i32>* %p2
+ ret <2 x i32> %res
+}
+
+define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v3i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v3i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, 8(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v3i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi)
+; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v3i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v3i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v3i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+ %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+ %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+ %res = sext <3 x i1> %obit to <3 x i32>
+ store <3 x i32> %val, <3 x i32>* %p2
+ ret <3 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v4i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+ %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i32> %val, <4 x i32>* %p2
+ ret <4 x i32> %res
+}
+
+define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v6i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %r8d, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %edx, %xmm3
+; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movd %r9d, %xmm2
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: movq %xmm3, 16(%rcx)
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movq %xmm2, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v6i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %r8d, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd %edx, %xmm3
+; SSSE3-NEXT: movd %esi, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT: movd %r9d, %xmm2
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, (%rcx)
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: paddd %xmm2, %xmm3
+; SSSE3-NEXT: movq %xmm3, 16(%rcx)
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movq %xmm2, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v6i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd %r9d, %xmm2
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: paddd %xmm0, %xmm3
+; SSE41-NEXT: pmaxud %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: movq %xmm1, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm3, (%rcx)
+; SSE41-NEXT: movq %xmm2, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v6i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v6i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v6i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+ %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+ %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+ %res = sext <6 x i1> %obit to <6 x i32>
+ store <6 x i32> %val, <6 x i32>* %p2
+ ret <6 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm2, (%rdi)
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v8i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, (%rdi)
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: paddd %xmm1, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v8i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: pmaxud %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm3
+; SSE41-NEXT: pmaxud %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps %ymm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+ %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %p2
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm7, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v16i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT: paddd %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm5, 16(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: paddd %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm6, 32(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: paddd %xmm3, %xmm7
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: pxor %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm7, 48(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v16i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddd %xmm0, %xmm4
+; SSE41-NEXT: pmaxud %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm5
+; SSE41-NEXT: pmaxud %xmm5, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT: pxor %xmm8, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm6
+; SSE41-NEXT: pmaxud %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm8, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm7
+; SSE41-NEXT: pmaxud %xmm7, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT: pxor %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm7, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm6, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmaxud %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmaxud %xmm7, %xmm5, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmovaps %ymm3, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpmaxud %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+ %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i32> %val, <16 x i32>* %p2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: paddb %xmm0, %xmm1
+; SSSE3-NEXT: pmaxub %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pmaxub %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpmaxub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpmaxub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+ %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i8> %val, <16 x i8>* %p2
+ ret <16 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, (%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: pmaxuw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltuw %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+ %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i16> %val, <8 x i16>* %p2
+ ret <8 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE-LABEL: uaddo_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i64> %val, <2 x i64>* %p2
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v4i24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 9(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %edx
+; SSE2-NEXT: movw %dx, 6(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT: movd %xmm1, %esi
+; SSE2-NEXT: movw %si, 3(%rdi)
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: shrl $16, %ecx
+; SSE2-NEXT: movb %cl, 11(%rdi)
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 8(%rdi)
+; SSE2-NEXT: shrl $16, %esi
+; SSE2-NEXT: movb %sil, 5(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v4i24:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: paddd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: movd %xmm2, %eax
+; SSSE3-NEXT: movw %ax, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %ecx
+; SSSE3-NEXT: movw %cx, 9(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %edx
+; SSSE3-NEXT: movw %dx, 6(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
+; SSSE3-NEXT: movw %si, 3(%rdi)
+; SSSE3-NEXT: shrl $16, %eax
+; SSSE3-NEXT: movb %al, 2(%rdi)
+; SSSE3-NEXT: shrl $16, %ecx
+; SSSE3-NEXT: movb %cl, 11(%rdi)
+; SSSE3-NEXT: shrl $16, %edx
+; SSSE3-NEXT: movb %dl, 8(%rdi)
+; SSSE3-NEXT: shrl $16, %esi
+; SSSE3-NEXT: movb %sil, 5(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v4i24:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: movw %ax, 9(%rdi)
+; SSE41-NEXT: pextrd $2, %xmm0, %ecx
+; SSE41-NEXT: movw %cx, 6(%rdi)
+; SSE41-NEXT: pextrd $1, %xmm0, %edx
+; SSE41-NEXT: movw %dx, 3(%rdi)
+; SSE41-NEXT: movd %xmm0, %esi
+; SSE41-NEXT: movw %si, (%rdi)
+; SSE41-NEXT: shrl $16, %eax
+; SSE41-NEXT: movb %al, 11(%rdi)
+; SSE41-NEXT: shrl $16, %ecx
+; SSE41-NEXT: movb %cl, 8(%rdi)
+; SSE41-NEXT: shrl $16, %edx
+; SSE41-NEXT: movb %dl, 5(%rdi)
+; SSE41-NEXT: shrl $16, %esi
+; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v4i24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: movw %ax, 9(%rdi)
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: movw %cx, 6(%rdi)
+; AVX1-NEXT: vpextrd $1, %xmm1, %edx
+; AVX1-NEXT: movw %dx, 3(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %esi
+; AVX1-NEXT: movw %si, (%rdi)
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: movb %al, 11(%rdi)
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movb %cl, 8(%rdi)
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: movb %dl, 5(%rdi)
+; AVX1-NEXT: shrl $16, %esi
+; AVX1-NEXT: movb %sil, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v4i24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm1, %eax
+; AVX2-NEXT: movw %ax, 9(%rdi)
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: movw %cx, 6(%rdi)
+; AVX2-NEXT: vpextrd $1, %xmm1, %edx
+; AVX2-NEXT: movw %dx, 3(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movw %si, (%rdi)
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: movb %al, 11(%rdi)
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movb %cl, 8(%rdi)
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: movb %dl, 5(%rdi)
+; AVX2-NEXT: shrl $16, %esi
+; AVX2-NEXT: movb %sil, 2(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v4i24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpextrd $3, %xmm1, %eax
+; AVX512-NEXT: movw %ax, 9(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT: movw %cx, 6(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm1, %edx
+; AVX512-NEXT: movw %dx, 3(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %esi
+; AVX512-NEXT: movw %si, (%rdi)
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: movb %al, 11(%rdi)
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movb %cl, 8(%rdi)
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: movb %dl, 5(%rdi)
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movb %sil, 2(%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+ %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i24> %val, <4 x i24>* %p2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: uaddo_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: movb %al, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k2
+; AVX512-NEXT: kxnorw %k1, %k0, %k1
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kmovd %k2, %eax
+; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+ %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i1> %val, <4 x i1>* %p2
+ ret <4 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v2i128:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %r11d
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %r9, %rsi
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rcx, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: uaddo_v2i128:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %r11d
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %r9, %rsi
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rcx, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: uaddo_v2i128:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %r11d
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %r9, %rsi
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rcx, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uaddo_v2i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %r11d
+; AVX1-NEXT: addq %r8, %rdi
+; AVX1-NEXT: adcq %r9, %rsi
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rcx, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v2i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %r11d
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: adcq %r9, %rsi
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rcx, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uaddo_v2i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: addq %r8, %rdi
+; AVX512-NEXT: adcq %r9, %rsi
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT: movq %rdx, 16(%r10)
+; AVX512-NEXT: movq %rdi, (%r10)
+; AVX512-NEXT: movq %rcx, 24(%r10)
+; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+ %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i128> %val, <2 x i128>* %p2
+ ret <2 x i32> %res
+}

Added: llvm/trunk/test/CodeGen/X86/vec_usubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_usubo.ll?rev=353464&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_usubo.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_usubo.ll Thu Feb 7 13:02:22 2019
@@ -0,0 +1,1422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: usubo_v1i32:
+; SSE: # %bb.0:
+; SSE-NEXT: subl %esi, %edi
+; SSE-NEXT: sbbl %eax, %eax
+; SSE-NEXT: movl %edi, (%rdx)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: usubo_v1i32:
+; AVX: # %bb.0:
+; AVX-NEXT: subl %esi, %edi
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: movl %edi, (%rdx)
+; AVX-NEXT: retq
+ %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+ %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+ %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+ %res = sext <1 x i1> %obit to <1 x i32>
+ store <1 x i32> %val, <1 x i32>* %p2
+ ret <1 x i32> %res
+}
+
+define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+ %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i32> %val, <2 x i32>* %p2
+ ret <2 x i32> %res
+}
+
+define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v3i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v3i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v3i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pminud %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi)
+; SSE41-NEXT: movq %xmm2, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v3i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v3i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v3i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+ %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+ %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+ %res = sext <3 x i1> %obit to <3 x i32>
+ store <3 x i32> %val, <3 x i32>* %p2
+ ret <3 x i32> %res
+}
+
+define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v4i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pminud %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+ %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i32> %val, <4 x i32>* %p2
+ ret <4 x i32> %res
+}
+
+define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v6i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: movd %r8d, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd %edx, %xmm3
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubd %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm3, %xmm0
+; SSE2-NEXT: movq %xmm0, 16(%rcx)
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: movq %xmm0, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v6i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT: movd %r8d, %xmm0
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movd %edx, %xmm3
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movd %r9d, %xmm1
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psubd %xmm2, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm4, (%rcx)
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm3, %xmm0
+; SSSE3-NEXT: movq %xmm0, 16(%rcx)
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: movq %xmm0, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v6i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movd %r9d, %xmm2
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubd %xmm3, %xmm4
+; SSE41-NEXT: pminud %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psubd %xmm1, %xmm5
+; SSE41-NEXT: pminud %xmm5, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: movq %xmm5, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm4, (%rcx)
+; SSE41-NEXT: movq %xmm2, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v6i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v6i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v6i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+ %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+ %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+ %res = sext <6 x i1> %obit to <6 x i32>
+ store <6 x i32> %val, <6 x i32>* %p2
+ ret <6 x i32> %res
+}
+
+define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v8i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v8i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubd %xmm2, %xmm4
+; SSE41-NEXT: pminud %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psubd %xmm3, %xmm5
+; SSE41-NEXT: pminud %xmm5, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps %ymm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+ %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %p2
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: psubd %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: psubd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: psubd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v16i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: psubd %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: psubd %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: psubd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v16i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: psubd %xmm4, %xmm8
+; SSE41-NEXT: pminud %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psubd %xmm5, %xmm4
+; SSE41-NEXT: pminud %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: psubd %xmm6, %xmm5
+; SSE41-NEXT: pminud %xmm5, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm7, %xmm6
+; SSE41-NEXT: pminud %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pxor %xmm9, %xmm3
+; SSE41-NEXT: movdqa %xmm6, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm4, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm8, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpsubd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpminud %xmm7, %xmm5, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm7
+; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmovaps %ymm3, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpminud %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+ %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i32> %val, <16 x i32>* %p2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: usubo_v16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubb %xmm1, %xmm4
+; SSE2-NEXT: pminub %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psubb %xmm1, %xmm4
+; SSSE3-NEXT: pminub %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubb %xmm1, %xmm4
+; SSE41-NEXT: pminub %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpminub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleub %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+ %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+ %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+ %res = sext <16 x i1> %obit to <16 x i32>
+ store <16 x i8> %val, <16 x i8>* %p2
+ ret <16 x i32> %res
+}
+
+define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: usubo_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: psubw %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: pslld $31, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubw %xmm1, %xmm2
+; SSE41-NEXT: pminuw %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm1
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+ %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+ %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+ %res = sext <8 x i1> %obit to <8 x i32>
+ store <8 x i16> %val, <8 x i16>* %p2
+ ret <8 x i32> %res
+}
+
+define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE-LABEL: usubo_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: psubq %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: usubo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i64> %val, <2 x i64>* %p2
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: usubo_v4i24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 9(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %edx
+; SSE2-NEXT: movw %dx, 6(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT: movd %xmm1, %esi
+; SSE2-NEXT: movw %si, 3(%rdi)
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: shrl $16, %ecx
+; SSE2-NEXT: movb %cl, 11(%rdi)
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 8(%rdi)
+; SSE2-NEXT: shrl $16, %esi
+; SSE2-NEXT: movb %sil, 5(%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v4i24:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: movd %xmm2, %eax
+; SSSE3-NEXT: movw %ax, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %ecx
+; SSSE3-NEXT: movw %cx, 9(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %edx
+; SSSE3-NEXT: movw %dx, 6(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
+; SSSE3-NEXT: movw %si, 3(%rdi)
+; SSSE3-NEXT: shrl $16, %eax
+; SSSE3-NEXT: movb %al, 2(%rdi)
+; SSSE3-NEXT: shrl $16, %ecx
+; SSSE3-NEXT: movb %cl, 11(%rdi)
+; SSSE3-NEXT: shrl $16, %edx
+; SSSE3-NEXT: movb %dl, 8(%rdi)
+; SSSE3-NEXT: shrl $16, %esi
+; SSSE3-NEXT: movb %sil, 5(%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v4i24:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: movw %ax, 9(%rdi)
+; SSE41-NEXT: pextrd $2, %xmm0, %ecx
+; SSE41-NEXT: movw %cx, 6(%rdi)
+; SSE41-NEXT: pextrd $1, %xmm0, %edx
+; SSE41-NEXT: movw %dx, 3(%rdi)
+; SSE41-NEXT: movd %xmm0, %esi
+; SSE41-NEXT: movw %si, (%rdi)
+; SSE41-NEXT: shrl $16, %eax
+; SSE41-NEXT: movb %al, 11(%rdi)
+; SSE41-NEXT: shrl $16, %ecx
+; SSE41-NEXT: movb %cl, 8(%rdi)
+; SSE41-NEXT: shrl $16, %edx
+; SSE41-NEXT: movb %dl, 5(%rdi)
+; SSE41-NEXT: shrl $16, %esi
+; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v4i24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: movw %ax, 9(%rdi)
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: movw %cx, 6(%rdi)
+; AVX1-NEXT: vpextrd $1, %xmm1, %edx
+; AVX1-NEXT: movw %dx, 3(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %esi
+; AVX1-NEXT: movw %si, (%rdi)
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: movb %al, 11(%rdi)
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movb %cl, 8(%rdi)
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: movb %dl, 5(%rdi)
+; AVX1-NEXT: shrl $16, %esi
+; AVX1-NEXT: movb %sil, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v4i24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm1, %eax
+; AVX2-NEXT: movw %ax, 9(%rdi)
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: movw %cx, 6(%rdi)
+; AVX2-NEXT: vpextrd $1, %xmm1, %edx
+; AVX2-NEXT: movw %dx, 3(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movw %si, (%rdi)
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: movb %al, 11(%rdi)
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movb %cl, 8(%rdi)
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: movb %dl, 5(%rdi)
+; AVX2-NEXT: shrl $16, %esi
+; AVX2-NEXT: movb %sil, 2(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v4i24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpextrd $3, %xmm1, %eax
+; AVX512-NEXT: movw %ax, 9(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT: movw %cx, 6(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm1, %edx
+; AVX512-NEXT: movw %dx, 3(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %esi
+; AVX512-NEXT: movw %si, (%rdi)
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: movb %al, 11(%rdi)
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movb %cl, 8(%rdi)
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: movb %dl, 5(%rdi)
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movb %sil, 2(%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+ %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i24> %val, <4 x i24>* %p2
+ ret <4 x i32> %res
+}
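
For reference, the promoted-type handling visible above — both inputs masked to 24 bits, the subtraction done in 32-bit lanes, and overflow flagged when the difference no longer fits in 24 bits (the pand / pcmpeqd / pxor sequences) — corresponds roughly to the following generic-IR sketch. The function name is illustrative only:

define <4 x i1> @usubo_v4i24_promoted_sketch(<4 x i32> %a0, <4 x i32> %a1) {
  ; operate on the low 24 bits of each promoted i32 lane
  %a = and <4 x i32> %a0, <i32 16777215, i32 16777215, i32 16777215, i32 16777215>
  %b = and <4 x i32> %a1, <i32 16777215, i32 16777215, i32 16777215, i32 16777215>
  %d = sub <4 x i32> %a, %b
  ; overflowed iff the 32-bit difference no longer fits in 24 bits
  %t = and <4 x i32> %d, <i32 16777215, i32 16777215, i32 16777215, i32 16777215>
  %ovf = icmp ne <4 x i32> %t, %d
  ret <4 x i1> %ovf
}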
+
+define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: usubo_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: movb %al, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: usubo_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: retq
+ %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+ %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+ %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+ %res = sext <4 x i1> %obit to <4 x i32>
+ store <4 x i1> %val, <4 x i1>* %p2
+ ret <4 x i32> %res
+}
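
For reference, in the <4 x i1> case the subtraction reduces to XOR and the borrow to ~a & b (a borrow occurs only when a is 0 and b is 1), which is what the kxorw plus masked vptestnmd sequence above computes. A rough generic-IR sketch, with an illustrative function name:

define <4 x i1> @usubo_v4i1_sketch(<4 x i1> %a, <4 x i1> %b) {
  ; in i1 arithmetic the difference is simply XOR
  %diff = xor <4 x i1> %a, %b
  ; and a borrow occurs exactly when a is 0 and b is 1
  %nota = xor <4 x i1> %a, <i1 true, i1 true, i1 true, i1 true>
  %ovf  = and <4 x i1> %nota, %b
  ret <4 x i1> %ovf
}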
+
+define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: usubo_v2i128:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %r11d
+; SSE2-NEXT: subq %r8, %rdi
+; SSE2-NEXT: sbbq %r9, %rsi
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rcx, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: usubo_v2i128:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %r11d
+; SSSE3-NEXT: subq %r8, %rdi
+; SSSE3-NEXT: sbbq %r9, %rsi
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rcx, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: usubo_v2i128:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %r11d
+; SSE41-NEXT: subq %r8, %rdi
+; SSE41-NEXT: sbbq %r9, %rsi
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rcx, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: usubo_v2i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %r11d
+; AVX1-NEXT: subq %r8, %rdi
+; AVX1-NEXT: sbbq %r9, %rsi
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rcx, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v2i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %r11d
+; AVX2-NEXT: subq %r8, %rdi
+; AVX2-NEXT: sbbq %r9, %rsi
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rcx, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: usubo_v2i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: subq %r8, %rdi
+; AVX512-NEXT: sbbq %r9, %rsi
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT: movq %rdx, 16(%r10)
+; AVX512-NEXT: movq %rdi, (%r10)
+; AVX512-NEXT: movq %rcx, 24(%r10)
+; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+ %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+ %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+ %res = sext <2 x i1> %obit to <2 x i32>
+ store <2 x i128> %val, <2 x i128>* %p2
+ ret <2 x i32> %res
+}
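
For reference, the subq/sbbq/setb chains above perform the 128-bit subtraction one 64-bit half at a time, propagating the borrow from the low half into the high half. A sketch of one element in terms of the scalar overflow intrinsic (the function name is illustrative only):

define { i128, i1 } @usubo_i128_halves_sketch(i128 %a, i128 %b) {
  %a.lo = trunc i128 %a to i64
  %a.hi.s = lshr i128 %a, 64
  %a.hi = trunc i128 %a.hi.s to i64
  %b.lo = trunc i128 %b to i64
  %b.hi.s = lshr i128 %b, 64
  %b.hi = trunc i128 %b.hi.s to i64
  ; low half: plain subtract, producing the initial borrow (subq + setb style)
  %lo = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.lo, i64 %b.lo)
  %d.lo = extractvalue { i64, i1 } %lo, 0
  %c.lo = extractvalue { i64, i1 } %lo, 1
  ; high half: subtract the operand, then the incoming borrow (sbbq style)
  %hi1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.hi, i64 %b.hi)
  %t = extractvalue { i64, i1 } %hi1, 0
  %c1 = extractvalue { i64, i1 } %hi1, 1
  %c.lo.z = zext i1 %c.lo to i64
  %hi2 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %t, i64 %c.lo.z)
  %d.hi = extractvalue { i64, i1 } %hi2, 0
  %c2 = extractvalue { i64, i1 } %hi2, 1
  %borrow = or i1 %c1, %c2
  ; reassemble the 128-bit difference
  %d.hi.w = zext i64 %d.hi to i128
  %d.hi.sh = shl i128 %d.hi.w, 64
  %d.lo.w = zext i64 %d.lo to i128
  %d = or i128 %d.hi.sh, %d.lo.w
  %r0 = insertvalue { i128, i1 } undef, i128 %d, 0
  %r1 = insertvalue { i128, i1 } %r0, i1 %borrow, 1
  ret { i128, i1 } %r1
}

declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)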