[llvm-branch-commits] [llvm] 6dbd0d3 - [DAG] Move vselect(icmp_ult, -1, add(x, y)) -> uaddsat(x, y) to DAGCombine (PR40111)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits@lists.llvm.org
Tue Dec 1 04:03:58 PST 2020
Author: Simon Pilgrim
Date: 2020-12-01T11:56:26Z
New Revision: 6dbd0d36a1729e129bb11647b91bdb615d42c98c
URL: https://github.com/llvm/llvm-project/commit/6dbd0d36a1729e129bb11647b91bdb615d42c98c
DIFF: https://github.com/llvm/llvm-project/commit/6dbd0d36a1729e129bb11647b91bdb615d42c98c.diff
LOG: [DAG] Move vselect(icmp_ult, -1, add(x,y)) -> uaddsat(x,y) to DAGCombine (PR40111)
Move the X86 VSELECT->UADDSAT fold to DAGCombiner - there's nothing target-specific about these folds.
The SSE42 test diffs are relatively benign - it's avoiding an extra constant load in exchange for an extra xor operation - but there are extra register moves, which is annoying as all those operations commute and should allow the moves to be cleaned up.
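For reference, a minimal LLVM IR sketch of the compare-against-sum pattern this combine now recognizes (it mirrors the unsigned_sat_variable_v4i32_using_cmp_sum test below; the function name is illustrative only):

  define <4 x i32> @uadd_sat_v4i32(<4 x i32> %x, <4 x i32> %y) {
    %a = add <4 x i32> %x, %y
    ; unsigned overflow occurred iff x >u x+y
    %c = icmp ugt <4 x i32> %x, %a
    ; clamp to all-ones on overflow, otherwise keep the sum
    %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
    ret <4 x i32> %r
  }

With this patch the VSELECT form of that select is matched generically in DAGCombiner and emitted as ISD::UADDSAT whenever the target reports the operation as supported (legal or custom) for the vector type, instead of only for the i8/i16 cases the removed X86 combine handled.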
Differential Revision: https://reviews.llvm.org/D91876
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/AArch64/sat-add.ll
llvm/test/CodeGen/PowerPC/sat-add.ll
llvm/test/CodeGen/X86/sat-add.ll
llvm/test/CodeGen/X86/uadd_sat_vec.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1b5debfe602e..7a87521ae344 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9698,6 +9698,51 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
}
}
+
+ // Match VSELECTs into add with unsigned saturation.
+ if (hasOperation(ISD::UADDSAT, VT)) {
+ // Check if one of the arms of the VSELECT is vector with all bits set.
+ // If it's on the left side invert the predicate to simplify logic below.
+ SDValue Other;
+ ISD::CondCode SatCC = CC;
+ if (ISD::isBuildVectorAllOnes(N1.getNode())) {
+ Other = N2;
+ SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
+ } else if (ISD::isBuildVectorAllOnes(N2.getNode())) {
+ Other = N1;
+ }
+
+ if (Other && Other.getOpcode() == ISD::ADD) {
+ SDValue CondLHS = LHS, CondRHS = RHS;
+ SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
+
+ // Canonicalize condition operands.
+ if (SatCC == ISD::SETUGE) {
+ std::swap(CondLHS, CondRHS);
+ SatCC = ISD::SETULE;
+ }
+
+ // We can test against either of the addition operands.
+ // x <= x+y ? x+y : ~0 --> uaddsat x, y
+ // x+y >= x ? x+y : ~0 --> uaddsat x, y
+ if (SatCC == ISD::SETULE && Other == CondRHS &&
+ (OpLHS == CondLHS || OpRHS == CondLHS))
+ return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+
+ if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
+ CondLHS == OpLHS) {
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x >= ~C ? x+C : ~0 --> uaddsat x, C
+ auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+ return Cond->getAPIntValue() == ~Op->getAPIntValue();
+ };
+ if (SatCC == ISD::SETULE &&
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
+ return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
}
if (SimplifySelectOps(N, N1, N2))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e45a311f84a4..d27ada4c4b38 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7525,13 +7525,13 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
assert(VT.isInteger() && "Expected operands to be integers");
// usub.sat(a, b) -> umax(a, b) - b
- if (Opcode == ISD::USUBSAT && isOperationLegalOrCustom(ISD::UMAX, VT)) {
+ if (Opcode == ISD::USUBSAT && isOperationLegal(ISD::UMAX, VT)) {
SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
}
// uadd.sat(a, b) -> umin(a, ~b) + b
- if (Opcode == ISD::UADDSAT && isOperationLegalOrCustom(ISD::UMIN, VT)) {
+ if (Opcode == ISD::UADDSAT && isOperationLegal(ISD::UMIN, VT)) {
SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1274582614ed..ce3497934a07 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -922,9 +922,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
@@ -1103,6 +1101,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1143,6 +1143,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -26889,17 +26893,6 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
EVT SetCCResultType =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
- // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
- // TODO: Move this to DAGCombiner?
- if (SetCCResultType == VT &&
- DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
- return DAG.getNode(ISD::OR, DL, VT, Cmp, Add);
- return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
- }
-
if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
// usubsat X, Y --> (X >u Y) ? X - Y : 0
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
@@ -40988,59 +40981,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- // Match VSELECTs into add with unsigned saturation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
- // paddus is available in SSE2 for i8 and i16 vectors.
- Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
- isPowerOf2_32(VT.getVectorNumElements()) &&
- (VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16)) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- SDValue CondLHS = Cond->getOperand(0);
- SDValue CondRHS = Cond->getOperand(1);
-
- // Check if one of the arms of the VSELECT is vector with all bits set.
- // If it's on the left side invert the predicate to simplify logic below.
- SDValue Other;
- if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
- Other = RHS;
- CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
- } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
- Other = LHS;
- }
-
- if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
- SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
-
- // Canonicalize condition operands.
- if (CC == ISD::SETUGE) {
- std::swap(CondLHS, CondRHS);
- CC = ISD::SETULE;
- }
-
- // We can test against either of the addition operands.
- // x <= x+y ? x+y : ~0 --> addus x, y
- // x+y >= x ? x+y : ~0 --> addus x, y
- if (CC == ISD::SETULE && Other == CondRHS &&
- (OpLHS == CondLHS || OpRHS == CondLHS))
- return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-
- if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
- CondLHS == OpLHS) {
- // If the RHS is a constant we have to reverse the const
- // canonicalization.
- // x > ~C ? x+C : ~0 --> addus x, C
- auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return Cond->getAPIntValue() == ~Op->getAPIntValue();
- };
- if (CC == ISD::SETULE &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
- return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
- }
- }
- }
-
// Check if the first operand is all zeros and Cond type is vXi1.
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index c029ffdbcc5d..c38f505be5d7 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -360,9 +360,7 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #42
-; CHECK-NEXT: add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, %a
@@ -374,10 +372,7 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #42
-; CHECK-NEXT: movi v2.16b, #213
-; CHECK-NEXT: add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmhi v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
@@ -403,9 +398,7 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.8h, #42
-; CHECK-NEXT: add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, %a
@@ -417,10 +410,7 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) {
; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.8h, #42
-; CHECK-NEXT: mvni v2.8h, #42
-; CHECK-NEXT: add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
@@ -446,9 +436,7 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #42
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, %a
@@ -460,10 +448,7 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #42
-; CHECK-NEXT: mvni v2.4s, #42
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
@@ -493,9 +478,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
; CHECK-NEXT: dup v1.2d, x8
-; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, %a
@@ -507,12 +490,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: mov x9, #-43
; CHECK-NEXT: dup v1.2d, x8
-; CHECK-NEXT: dup v2.2d, x9
-; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmhi v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, <i64 -43, i64 -43>
@@ -537,9 +516,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8>
define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a = add <16 x i8> %x, %y
%c = icmp ugt <16 x i8> %x, %a
@@ -579,9 +556,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16>
define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%a = add <8 x i16> %x, %y
%c = icmp ugt <8 x i16> %x, %a
@@ -621,9 +596,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%a = add <4 x i32> %x, %y
%c = icmp ugt <4 x i32> %x, %a
@@ -664,9 +637,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64>
define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%a = add <2 x i64> %x, %y
%c = icmp ugt <2 x i64> %x, %a
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index e0816f71dc60..ab2b8d18fb6e 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -396,12 +396,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI25_0@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
; CHECK-NEXT: addi 3, 3, .LCPI25_0@toc@l
; CHECK-NEXT: lvx 3, 0, 3
-; CHECK-NEXT: vaddubm 3, 2, 3
-; CHECK-NEXT: vcmpgtub 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vaddubs 2, 2, 3
; CHECK-NEXT: blr
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, %a
@@ -412,16 +409,10 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
; CHECK: # %bb.0:
-; CHECK-NEXT: addis 3, 2, .LCPI26_1@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: addi 3, 3, .LCPI26_1@toc@l
-; CHECK-NEXT: lvx 3, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI26_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI26_0@toc@l
-; CHECK-NEXT: vcmpgtub 3, 2, 3
-; CHECK-NEXT: lvx 4, 0, 3
-; CHECK-NEXT: vaddubm 2, 2, 4
-; CHECK-NEXT: xxsel 34, 34, 0, 35
+; CHECK-NEXT: lvx 3, 0, 3
+; CHECK-NEXT: vaddubs 2, 2, 3
; CHECK-NEXT: blr
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
@@ -451,12 +442,9 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI28_0@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
; CHECK-NEXT: addi 3, 3, .LCPI28_0@toc@l
; CHECK-NEXT: lvx 3, 0, 3
-; CHECK-NEXT: vadduhm 3, 2, 3
-; CHECK-NEXT: vcmpgtuh 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vadduhs 2, 2, 3
; CHECK-NEXT: blr
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, %a
@@ -467,16 +455,10 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) {
; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
; CHECK: # %bb.0:
-; CHECK-NEXT: addis 3, 2, .LCPI29_1@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: addi 3, 3, .LCPI29_1@toc@l
-; CHECK-NEXT: lvx 3, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI29_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI29_0@toc@l
-; CHECK-NEXT: vcmpgtuh 3, 2, 3
-; CHECK-NEXT: lvx 4, 0, 3
-; CHECK-NEXT: vadduhm 2, 2, 4
-; CHECK-NEXT: xxsel 34, 34, 0, 35
+; CHECK-NEXT: lvx 3, 0, 3
+; CHECK-NEXT: vadduhs 2, 2, 3
; CHECK-NEXT: blr
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
@@ -506,12 +488,9 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI31_0@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
; CHECK-NEXT: addi 3, 3, .LCPI31_0@toc@l
; CHECK-NEXT: lvx 3, 0, 3
-; CHECK-NEXT: vadduwm 3, 2, 3
-; CHECK-NEXT: vcmpgtuw 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vadduws 2, 2, 3
; CHECK-NEXT: blr
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, %a
@@ -522,16 +501,10 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; CHECK: # %bb.0:
-; CHECK-NEXT: addis 3, 2, .LCPI32_1@toc@ha
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: addi 3, 3, .LCPI32_1@toc@l
-; CHECK-NEXT: lvx 3, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI32_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI32_0@toc@l
-; CHECK-NEXT: vcmpgtuw 3, 2, 3
-; CHECK-NEXT: lvx 4, 0, 3
-; CHECK-NEXT: vadduwm 2, 2, 4
-; CHECK-NEXT: xxsel 34, 34, 0, 35
+; CHECK-NEXT: lvx 3, 0, 3
+; CHECK-NEXT: vadduws 2, 2, 3
; CHECK-NEXT: blr
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
@@ -616,10 +589,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8>
define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vaddubm 3, 2, 3
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: vcmpgtub 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vaddubs 2, 2, 3
; CHECK-NEXT: blr
%a = add <16 x i8> %x, %y
%c = icmp ugt <16 x i8> %x, %a
@@ -660,10 +630,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16>
define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduhm 3, 2, 3
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: vcmpgtuh 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vadduhs 2, 2, 3
; CHECK-NEXT: blr
%a = add <8 x i16> %x, %y
%c = icmp ugt <8 x i16> %x, %a
@@ -704,10 +671,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduwm 3, 2, 3
-; CHECK-NEXT: xxleqv 0, 0, 0
-; CHECK-NEXT: vcmpgtuw 2, 2, 3
-; CHECK-NEXT: xxsel 34, 35, 0, 34
+; CHECK-NEXT: vadduws 2, 2, 3
; CHECK-NEXT: blr
%a = add <4 x i32> %x, %y
%c = icmp ugt <4 x i32> %x, %a
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 1dae100e0994..149f3cea0b7e 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -385,19 +385,10 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
; SSE-NEXT: paddusb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, %a
%r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
@@ -410,18 +401,10 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
; SSE-NEXT: paddusb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
%r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
@@ -460,19 +443,10 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
; SSE-NEXT: paddusw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, %a
%r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
@@ -485,18 +459,10 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) {
; SSE-NEXT: paddusw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
%r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
@@ -555,35 +521,22 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
;
; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [42,42,42,42]
-; SSE4-NEXT: paddd %xmm0, %xmm2
-; SSE4-NEXT: movdqa %xmm0, %xmm1
-; SSE4-NEXT: pminud %xmm2, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm0, %xmm1
-; SSE4-NEXT: por %xmm2, %xmm1
-; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE4-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253]
+; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, %a
@@ -603,31 +556,22 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
;
; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
-; SSE4-NEXT: pmaxud %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: por %xmm1, %xmm0
+; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE4-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
-; AVX2-NEXT: vpmaxud %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253]
+; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud {{.*}}(%rip){1to4}, %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
@@ -647,30 +591,15 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat(<4 x i32
;
; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [43,44,45,46]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967253,4294967252,4294967251,4294967250]
-; SSE4-NEXT: pmaxud %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: por %xmm1, %xmm0
+; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE4-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud {{.*}}(%rip), %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
+; AVX: # %bb.0:
+; AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <4 x i32> %x, <i32 43, i32 44, i32 45, i32 46>
%c = icmp ugt <4 x i32> %x, <i32 -44, i32 -45, i32 -46, i32 -47>
%r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
@@ -788,32 +717,30 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
-; SSE42-NEXT: paddq %xmm0, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm2, %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: pxor %xmm2, %xmm1
+; SSE42-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vpminuq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, %a
@@ -858,28 +785,30 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
-; SSE42-NEXT: paddq %xmm0, %xmm1
-; SSE42-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE42-NEXT: pcmpgtq {{.*}}(%rip), %xmm0
-; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: pxor %xmm2, %xmm1
+; SSE42-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleuq {{.*}}(%rip), %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vpminuq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, <i64 -43, i64 -43>
@@ -924,19 +853,10 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x
; SSE-NEXT: paddusb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <16 x i8> %x, %y
%c = icmp ugt <16 x i8> %x, %a
%r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
@@ -1030,19 +950,10 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i
; SSE-NEXT: paddusw %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
+; AVX: # %bb.0:
+; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = add <8 x i16> %x, %y
%c = icmp ugt <8 x i16> %x, %a
%r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
@@ -1159,33 +1070,26 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i
;
; SSE4-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; SSE4: # %bb.0:
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pminud %xmm1, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm0, %xmm2
-; SSE4-NEXT: por %xmm1, %xmm2
-; SSE4-NEXT: movdqa %xmm2, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT: pxor %xmm1, %xmm2
+; SSE4-NEXT: pminud %xmm2, %xmm0
+; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa %xmm1, %xmm2
+; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2
+; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <4 x i32> %x, %y
%c = icmp ugt <4 x i32> %x, %a
@@ -1364,8 +1268,8 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
;
; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
-; SSE42-NEXT: paddq %xmm0, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: paddq %xmm0, %xmm1
; SSE42-NEXT: pxor %xmm2, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm2
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
@@ -1374,21 +1278,20 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
;
; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa %xmm1, %xmm2
+; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2
+; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%a = add <2 x i64> %x, %y
%c = icmp ugt <2 x i64> %x, %a
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 45fe82feb8b5..633238f0b1ed 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -937,7 +937,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i64:
@@ -947,7 +947,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v2i64:
@@ -1012,12 +1012,12 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1028,7 +1028,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i64:
@@ -1118,12 +1118,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
@@ -1131,12 +1131,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
@@ -1147,12 +1147,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64: