[llvm] 0341029 - [X86][AVX] LowerADDSAT_SUBSAT - avoid X86ISD::BLENDV in UADDSAT/USUBSAT v8i32/v4i64 lowering
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 20 10:23:47 PST 2020
Author: Simon Pilgrim
Date: 2020-11-20T18:16:44Z
New Revision: 0341029bb414d346edcceeeabaf4c5bb3312c38c
URL: https://github.com/llvm/llvm-project/commit/0341029bb414d346edcceeeabaf4c5bb3312c38c
DIFF: https://github.com/llvm/llvm-project/commit/0341029bb414d346edcceeeabaf4c5bb3312c38c.diff
LOG: [X86][AVX] LowerADDSAT_SUBSAT - avoid X86ISD::BLENDV in UADDSAT/USUBSAT v8i32/v4i64 lowering
Use the OR(CMP,ADD) / AND(CMP,SUB) patterns, as we already do on SSE targets.
Enable custom lowering for v8i32/v4i64 and generalize the 128-bit lowering code to handle any vector size - this also lets us use the slightly cheaper icmp_ugt codegen instead of umin/umax.
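For reference, here is a minimal scalar sketch of the two patterns on a single 32-bit lane (the uaddsat/usubsat helper names are illustrative only, not LLVM API). In the vector form the compare yields an all-ones/all-zeros mask per lane, so the select folds into a plain OR or AND:

#include <cassert>
#include <cstdint>

// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y, folded to OR(CMP, ADD)
static uint32_t uaddsat(uint32_t x, uint32_t y) {
  uint32_t add = x + y;                 // may wrap
  uint32_t cmp = (x > add) ? ~0u : 0u;  // icmp ugt X, X+Y as a lane mask
  return cmp | add;                     // wrap -> saturate to all-ones
}

// usubsat X, Y --> (X >u Y) ? X - Y : 0, folded to AND(CMP, SUB)
static uint32_t usubsat(uint32_t x, uint32_t y) {
  uint32_t sub = x - y;                 // may wrap
  uint32_t cmp = (x > y) ? ~0u : 0u;    // icmp ugt X, Y as a lane mask
  return cmp & sub;                     // underflow -> saturate to zero
}

int main() {
  assert(uaddsat(0xFFFFFFF0u, 0x20u) == 0xFFFFFFFFu);
  assert(uaddsat(1u, 2u) == 3u);
  assert(usubsat(5u, 7u) == 0u);
  assert(usubsat(7u, 5u) == 2u);
  return 0;
}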
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/uadd_sat_vec.ll
llvm/test/CodeGen/X86/usub_sat_vec.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3f752ddd1c63..3587e0eb294c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1313,6 +1313,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
@@ -26854,58 +26858,58 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
+ SDLoc DL(Op);
+
if (VT.getScalarType() == MVT::i1) {
- SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
- return DAG.getNode(ISD::OR, dl, VT, X, Y);
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
- return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
+ return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
}
}
- if (VT.is128BitVector()) {
- // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), VT);
- SDLoc DL(Op);
- if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
- // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
- // TODO: Move this to DAGCombiner?
- if (SetCCResultType == VT &&
- DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
- return DAG.getNode(ISD::OR, DL, VT, Cmp, Add);
- return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
- }
- if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
- // usubsat X, Y --> (X >u Y) ? X - Y : 0
- SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
- // TODO: Move this to DAGCombiner?
- if (SetCCResultType == VT &&
- DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
- return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
- return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
- }
- // Use default expansion.
- return SDValue();
- }
-
- if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
+ (VT.is256BitVector() && !Subtarget.hasInt256())) {
+ assert(Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX vector integer operation");
return splitVectorIntBinary(Op, DAG);
+ }
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return splitVectorIntBinary(Op, DAG);
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
+ // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::OR, DL, VT, Cmp, Add);
+ return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
+ }
+
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+
+ // Use default expansion.
+ return SDValue();
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 7c8665ec086d..45fe82feb8b5 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -750,17 +750,16 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
;
; AVX1-LABEL: v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
@@ -870,26 +869,25 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
;
; AVX1-LABEL: v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vcmptrueps %ymm4, %ymm4, %ymm4
-; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm5
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpminud %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm4, %ymm3, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpminud %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4
+; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
@@ -1010,34 +1008,27 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vcmptrueps %ymm5, %ymm5, %ymm5
-; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm5
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i64:
@@ -1120,38 +1111,32 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm9
-; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vcmptrueps %ymm7, %ymm7, %ymm10
-; AVX1-NEXT: vxorps %ymm2, %ymm10, %ymm8
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6
-; AVX1-NEXT: vxorps %xmm4, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8
-; AVX1-NEXT: vxorps %ymm3, %ymm10, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vxorps %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
@@ -1159,19 +1144,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm7
-; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpcmpeqd %ymm7, %ymm7, %ymm7
-; AVX2-NEXT: vpxor %ymm7, %ymm2, %ymm8
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm4
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index 5a8c889165d6..6d54503f8509 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -919,13 +919,13 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -935,8 +935,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i64:
@@ -1022,26 +1022,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm4, %xmm6
-; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
@@ -1051,13 +1051,13 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64: