[llvm] 31373fb - [AArch64] Reassociate integer extending reductions to pairwise addition.
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 3 03:05:53 PST 2022
Author: David Green
Date: 2022-02-03T11:05:48Z
New Revision: 31373fb88a0a3464013e6ebc5773af27a0603275
URL: https://github.com/llvm/llvm-project/commit/31373fb88a0a3464013e6ebc5773af27a0603275
DIFF: https://github.com/llvm/llvm-project/commit/31373fb88a0a3464013e6ebc5773af27a0603275.diff
LOG: [AArch64] Reassociate integer extending reductions to pairwise addition.
Given an (integer) vecreduce, we know the order of the inputs does not matter.
We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) into
UADDV(UADDLP(x)). This can also happen through an extra add, where we transform
UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))) into
UADDV(add(y, UADDLP(x))). This makes sure the same thing happens for signed
cases too, which requires adding a new SADDLP node.
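For example, the following IR (reconstructed from the add_v8i16_v8i32_zext
test in llvm/test/CodeGen/AArch64/vecreduce-add.ll; only the CHECK lines
appear in the diff below, so the body is an inferred sketch):

  define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
  entry:
    %xx = zext <8 x i16> %x to <8 x i32>
    %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
    ret i32 %z
  }

  declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

is type-legalized into exactly the UADDV(add(zext(extract_lo(x)),
zext(extract_hi(x)))) shape, because v8i32 is wider than a 128-bit NEON
register. It previously lowered to ushll + uaddw2 + addv and now becomes a
single uaddlv. When two such reductions are summed (the add_pair_* tests),
the existing ADD(UADDV a, UADDV b) combine feeds this one, and the
UADALP/SADALP patterns then fold the remaining add into uaddlp + uadalp.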
Differential Revision: https://reviews.llvm.org/D118107
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/aarch64-addv.ll
llvm/test/CodeGen/AArch64/arm64-vabs.ll
llvm/test/CodeGen/AArch64/vecreduce-add.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 638bbd74c152d..c48f5c4b047f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2254,6 +2254,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ MAKE_CASE(AArch64ISD::SADDLP)
MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
@@ -4378,8 +4379,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
+ case Intrinsic::aarch64_neon_saddlp:
case Intrinsic::aarch64_neon_uaddlp: {
- unsigned Opcode = AArch64ISD::UADDLP;
+ unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
+ ? AArch64ISD::UADDLP
+ : AArch64ISD::SADDLP;
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
}
case Intrinsic::aarch64_neon_sdot:
@@ -13196,6 +13200,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
}
+// Given an (integer) vecreduce, we know the order of the inputs does not
+// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
+// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
+// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x)))))
+// into UADDV(add(y, UADDLP(x))).
+static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+ auto DetectAddExtract = [&](SDValue A) {
+ // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
+ // UADDLP(x) if found.
+ if (A.getOpcode() != ISD::ADD)
+ return SDValue();
+ EVT VT = A.getValueType();
+ SDValue Op0 = A.getOperand(0);
+ SDValue Op1 = A.getOperand(1);
+ if (Op0.getOpcode() != Op1.getOpcode() ||
+ (Op0.getOpcode() != ISD::ZERO_EXTEND &&
+ Op0.getOpcode() != ISD::SIGN_EXTEND))
+ return SDValue();
+ SDValue Ext0 = Op0.getOperand(0);
+ SDValue Ext1 = Op1.getOperand(0);
+ if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Ext0.getOperand(0) != Ext1.getOperand(0))
+ return SDValue();
+ // Check that the source type is twice the width of the add type, and that
+ // the two extracts are from the upper/lower halves of that source.
+ if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
+ VT.getVectorNumElements() * 2)
+ return SDValue();
+ if ((Ext0.getConstantOperandVal(1) != 0 ||
+ Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
+ (Ext1.getConstantOperandVal(1) != 0 ||
+ Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
+ return SDValue();
+ unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
+ : AArch64ISD::SADDLP;
+ return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
+ };
+
+ SDValue A = N->getOperand(0);
+ if (SDValue R = DetectAddExtract(A))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
+ if (A.getOpcode() == ISD::ADD) {
+ if (SDValue R = DetectAddExtract(A.getOperand(0)))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
+ A.getOperand(1)));
+ if (SDValue R = DetectAddExtract(A.getOperand(1)))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
+ A.getOperand(0)));
+ }
+ return SDValue();
+}
+
+
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -14722,7 +14781,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
}
// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
-static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Only scalar integer and vector types.
if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
@@ -14838,7 +14897,7 @@ static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Try to change sum of two reductions.
- if (SDValue Val = performUADDVCombine(N, DAG))
+ if (SDValue Val = performAddUADDVCombine(N, DAG))
return Val;
if (SDValue Val = performAddDotCombine(N, DAG))
return Val;
@@ -17805,6 +17864,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performExtractVectorEltCombine(N, DAG);
case ISD::VECREDUCE_ADD:
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+ case AArch64ISD::UADDV:
+ return performUADDVCombine(N, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 03d00302c56ba..db771070e71d3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -240,7 +240,8 @@ enum NodeType : unsigned {
SRHADD,
URHADD,
- // Unsigned Add Long Pairwise
+ // Add Long Pairwise
+ SADDLP,
UADDLP,
// udot/sdot instructions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 83bf89ff97c50..8dc91d6c1be2d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -643,9 +643,13 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
+def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>;
def AArch64uaddlp : PatFrags<(ops node:$src),
[(AArch64uaddlp_n node:$src),
(int_aarch64_neon_uaddlp node:$src)]>;
+def AArch64saddlp : PatFrags<(ops node:$src),
+ [(AArch64saddlp_n node:$src),
+ (int_aarch64_neon_saddlp node:$src)]>;
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -4312,8 +4316,8 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
- BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
-defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+ BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>;
defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 32d7bebdb292b..36b418b0cee1e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -65,9 +65,7 @@ define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index d1f1a399b872a..da48edc98d067 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -198,9 +198,7 @@ define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.16b v0, v0, v1
-; CHECK-NEXT: ushll.8h v1, v0, #0
-; CHECK-NEXT: uaddw2.8h v0, v1, v0
-; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: uaddlv.16b h0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%aload = load <16 x i8>, <16 x i8>* %a, align 1
@@ -261,9 +259,7 @@ define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.8h v0, v0, v1
-; CHECK-NEXT: ushll.4s v1, v0, #0
-; CHECK-NEXT: uaddw2.4s v0, v1, v0
-; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: uaddlv.8h s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%aload = load <8 x i16>, <8 x i16>* %a, align 1
@@ -282,9 +278,7 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: sabd8h_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: sabd.8h v0, v0, v1
-; CHECK-NEXT: ushll.4s v1, v0, #0
-; CHECK-NEXT: uaddw2.4s v0, v1, v0
-; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: uaddlv.8h s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%aext = sext <8 x i16> %a to <8 x i32>
@@ -338,9 +332,7 @@ define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uabd.4s v0, v0, v1
-; CHECK-NEXT: ushll.2d v1, v0, #0
-; CHECK-NEXT: uaddw2.2d v0, v1, v0
-; CHECK-NEXT: addp.2d d0, v0
+; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%aload = load <4 x i32>, <4 x i32>* %a, align 1
@@ -359,9 +351,7 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: sabd4s_rdx:
; CHECK: // %bb.0:
; CHECK-NEXT: sabd.4s v0, v0, v1
-; CHECK-NEXT: ushll.2d v1, v0, #0
-; CHECK-NEXT: uaddw2.2d v0, v1, v0
-; CHECK-NEXT: addp.2d d0, v0
+; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%aext = sext <4 x i32> %a to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 06077071468e2..273f2e02baa55 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -16,9 +16,7 @@ entry:
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -30,8 +28,7 @@ entry:
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.2d, v0.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT: saddlp v0.2d, v0.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -70,9 +67,7 @@ entry:
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
@@ -84,8 +79,7 @@ entry:
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-NEXT: saddlp v0.4s, v0.8h
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -170,9 +164,7 @@ define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -185,8 +177,7 @@ define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: sshll v1.2d, v0.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT: saddlp v0.2d, v0.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -282,9 +273,7 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
@@ -306,8 +295,7 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
@@ -358,8 +346,7 @@ entry:
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.8h, v0.8b, #0
-; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -372,8 +359,7 @@ entry:
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.8h, v0.8b, #0
-; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-NEXT: saddlp v0.8h, v0.16b
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: smov w0, v0.h[0]
; CHECK-NEXT: ret
@@ -511,9 +497,7 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -598,9 +582,7 @@ entry:
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
@@ -614,8 +596,7 @@ entry:
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.2d, v0.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT: saddlp v0.2d, v0.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
@@ -660,9 +641,7 @@ entry:
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
@@ -676,8 +655,7 @@ entry:
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-NEXT: saddlp v0.4s, v0.8h
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
@@ -775,9 +753,7 @@ define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
@@ -792,8 +768,7 @@ define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: sshll v1.2d, v0.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT: saddlp v0.2d, v0.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
@@ -901,9 +876,7 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-BASE-NEXT: fmov w8, s0
; CHECK-BASE-NEXT: add w0, w8, w0
; CHECK-BASE-NEXT: ret
@@ -928,8 +901,7 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w8, s0
; CHECK-BASE-NEXT: add w0, w8, w0
@@ -987,9 +959,7 @@ entry:
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v1.8h, v0.8b, #0
-; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: uaddlv h0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: and w0, w8, #0xffff
@@ -1004,8 +974,7 @@ entry:
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.8h, v0.8b, #0
-; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-NEXT: saddlp v0.8h, v0.16b
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
@@ -1163,9 +1132,7 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.2d, v0.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
@@ -1261,11 +1228,8 @@ entry:
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.2d, v0.2s, #0
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: uadalp v0.2d, v1.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1281,11 +1245,8 @@ entry:
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v2.2d, v0.2s, #0
-; CHECK-NEXT: sshll v3.2d, v1.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: saddlp v0.2d, v0.4s
+; CHECK-NEXT: sadalp v0.2d, v1.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1333,11 +1294,8 @@ entry:
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll v3.4s, v1.4h, #0
-; CHECK-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: uadalp v0.4s, v1.8h
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -1353,11 +1311,8 @@ entry:
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: sshll v3.4s, v1.4h, #0
-; CHECK-NEXT: saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-NEXT: saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: saddlp v0.4s, v0.8h
+; CHECK-NEXT: sadalp v0.4s, v1.8h
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -1476,11 +1431,8 @@ define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.2d, v0.2s, #0
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: uadalp v0.2d, v1.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1498,11 +1450,8 @@ define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: sshll v2.2d, v0.2s, #0
-; CHECK-NEXT: sshll v3.2d, v1.2s, #0
-; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: saddlp v0.2d, v0.4s
+; CHECK-NEXT: sadalp v0.2d, v1.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1632,11 +1581,8 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-BASE-NEXT: ushll v3.4s, v1.4h, #0
-; CHECK-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
@@ -1664,11 +1610,8 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-BASE-NEXT: sshll v3.4s, v1.4h, #0
-; CHECK-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
+; CHECK-BASE-NEXT: sadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
@@ -1733,12 +1676,8 @@ entry:
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
-; CHECK-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: uaddlv h1, v1.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: add w8, w8, w9
@@ -1756,10 +1695,8 @@ entry:
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v2.8h, v0.8b, #0
-; CHECK-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-NEXT: saddw2 v0.8h, v2.8h, v0.16b
-; CHECK-NEXT: saddw2 v1.8h, v3.8h, v1.16b
+; CHECK-NEXT: saddlp v0.8h, v0.16b
+; CHECK-NEXT: saddlp v1.8h, v1.16b
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: fmov w8, s0
@@ -1982,11 +1919,8 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-NEXT: bic v1.4h, #255, lsl #8
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.2d, v0.2s, #0
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: uadalp v0.2d, v1.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret