[llvm] [DAGCombiner] Add combine avg from shifts (PR #113909)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 29 04:15:33 PDT 2024
https://github.com/dnsampaio updated https://github.com/llvm/llvm-project/pull/113909
>From 3a7782d2f324d7b501254937a6f23b7cab67e970 Mon Sep 17 00:00:00 2001
From: Diogo Sampaio <dsampaio at kalrayinc.com>
Date: Mon, 28 Oct 2024 13:14:40 +0100
Subject: [PATCH] [DAGCombiner] Add combine avg from shifts
This teaches the DAGCombiner to fold:
`(asr (add nsw x, y), 1) -> (avgfloors x, y)`
`(lsr (add nuw x, y), 1) -> (avgflooru x, y)`
as well as combining them into the ceil variants:
`(avgfloors (add nsw x, y), 1) -> (avgceils x, y)`
`(avgflooru (add nuw x, y), 1) -> (avgceilu x, y)`
iff legal for the target (a standalone sketch below illustrates why the no-wrap flags are required).
Removes some of the ARM MVE patterns that are now dead code.
It also adds the avg opcodes to `IsQRMVEInstruction` so as to preserve the immediate-splatting behaviour as before.
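For illustration only (this sketch is not part of the patch and none of these helper names exist in LLVM), here is a minimal standalone C++ example of why the no-wrap flags are required: once the add wraps, the shifted sum no longer equals the floor or ceil average, so the fold would be incorrect.

#include <cstdint>
#include <cstdio>

// Floor average computed without overflow; this is what avgflooru yields.
static unsigned AvgFloorU(uint8_t X, uint8_t Y) {
  return (unsigned(X) + unsigned(Y)) / 2;
}

// Ceiling average computed without overflow; this is what avgceilu yields.
static unsigned AvgCeilU(uint8_t X, uint8_t Y) {
  return (unsigned(X) + unsigned(Y) + 1) / 2;
}

int main() {
  // Wrapping case: 200 + 100 wraps to 44 in 8 bits, so (add >> 1) gives 22
  // while the real floor average is 150. Without nuw the fold would be wrong.
  uint8_t X = 200, Y = 100;
  printf("wrap:    avgflooru=%u  shifted-sum=%u\n", AvgFloorU(X, Y),
         unsigned(uint8_t(X + Y) >> 1));

  // No-wrap case (what nuw guarantees): the shifted sum matches avgflooru and
  // the shifted sum plus one matches avgceilu, so both folds are sound.
  X = 100;
  Y = 61;
  printf("no-wrap: avgflooru=%u == %u, avgceilu=%u == %u\n", AvgFloorU(X, Y),
         unsigned(uint8_t(X + Y) >> 1), AvgCeilU(X, Y),
         unsigned(uint8_t(X + Y + 1) >> 1));
  return 0;
}

The signed folds fail in the same way when the addition overflows, which is why the combine checks hasNoSignedWrap() for SRA/AVGFLOORS and hasNoUnsignedWrap() for SRL/AVGFLOORU.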
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 76 ++++++
llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +
llvm/lib/Target/ARM/ARMInstrMVE.td | 85 +-----
llvm/test/CodeGen/AArch64/avg.ll | 241 ++++++++++++++++++
llvm/test/CodeGen/AArch64/sve-hadd.ll | 40 +++
5 files changed, 368 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b800204d917503..546f5c6152e647 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -401,6 +401,8 @@ namespace {
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
+ SDValue foldShiftToAvg(SDNode *N);
+
SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
ISD::CondCode CC);
@@ -5354,6 +5356,27 @@ SDValue DAGCombiner::visitAVG(SDNode *N) {
DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getAllOnesConstant(DL, VT)));
}
+ // Fold avgfloor((add nw x, y), 1) -> avgceil(x, y)
+ // Fold avgfloor((add nw x, 1), y) -> avgceil(x, y)
+ if ((Opcode == ISD::AVGFLOORU && hasOperation(ISD::AVGCEILU, VT)) ||
+ (Opcode == ISD::AVGFLOORS && hasOperation(ISD::AVGCEILS, VT))) {
+ SDValue Add;
+ if (sd_match(N,
+ m_c_BinOp(Opcode,
+ m_AllOf(m_Value(Add), m_Add(m_Value(X), m_Value(Y))),
+ m_One())) ||
+ sd_match(N, m_c_BinOp(Opcode,
+ m_AllOf(m_Value(Add), m_Add(m_Value(X), m_One())),
+ m_Value(Y)))) {
+
+ if (IsSigned && Add->getFlags().hasNoSignedWrap())
+ return DAG.getNode(ISD::AVGCEILS, DL, VT, X, Y);
+
+ if (!IsSigned && Add->getFlags().hasNoUnsignedWrap())
+ return DAG.getNode(ISD::AVGCEILU, DL, VT, X, Y);
+ }
+ }
+
return SDValue();
}
@@ -10626,6 +10649,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (SDValue NarrowLoad = reduceLoadWidth(N))
return NarrowLoad;
+ if (SDValue AVG = foldShiftToAvg(N))
+ return AVG;
+
return SDValue();
}
@@ -10880,6 +10906,9 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI))
return MULH;
+ if (SDValue AVG = foldShiftToAvg(N))
+ return AVG;
+
return SDValue();
}
@@ -11393,6 +11422,53 @@ static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS,
}
}
+SDValue DAGCombiner::foldShiftToAvg(SDNode *N) {
+ const unsigned Opcode = N->getOpcode();
+
+ // Convert (sr[al] (add n[su]w x, y), 1) -> (avgfloor[su] x, y)
+ if (Opcode != ISD::SRA && Opcode != ISD::SRL)
+ return SDValue();
+
+ unsigned FloorISD = 0;
+ auto VT = N->getValueType(0);
+ bool IsUnsigned = false;
+
+ // Decide whether the shift is signed or unsigned.
+ switch (Opcode) {
+ case ISD::SRA:
+ if (!hasOperation(ISD::AVGFLOORS, VT))
+ return SDValue();
+ FloorISD = ISD::AVGFLOORS;
+ break;
+ case ISD::SRL:
+ IsUnsigned = true;
+ if (!hasOperation(ISD::AVGFLOORU, VT))
+ return SDValue();
+ FloorISD = ISD::AVGFLOORU;
+ break;
+ default:
+ return SDValue();
+ }
+
+ // Captured values.
+ SDValue A, B, Add;
+
+ // Match floor average as it is common to both floor/ceil avgs.
+ if (!sd_match(N, m_BinOp(Opcode,
+ m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))),
+ m_One())))
+ return SDValue();
+
+ // Can't optimize adds that may wrap.
+ if (IsUnsigned && !Add->getFlags().hasNoUnsignedWrap())
+ return SDValue();
+
+ if (!IsUnsigned && !Add->getFlags().hasNoSignedWrap())
+ return SDValue();
+
+ return DAG.getNode(FloorISD, SDLoc(N), VT, {A, B});
+}
+
/// Generate Min/Max node
SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a98b7a8420927e..e08c0c0653eba2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7951,6 +7951,8 @@ static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
case ISD::MUL:
case ISD::SADDSAT:
case ISD::UADDSAT:
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
return true;
case ISD::SUB:
case ISD::SSUBSAT:
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 04d5d00eef10e6..8c8403ac58b080 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2222,64 +2222,6 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8, avgceilu>;
defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16, avgceilu>;
defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32, avgceilu>;
-// Rounding Halving Add perform the arithemtic operation with an extra bit of
-// precision, before performing the shift, to void clipping errors. We're not
-// modelling that here with these patterns, but we're using no wrap forms of
-// add to ensure that the extra bit of information is not needed for the
-// arithmetic or the rounding.
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvmovImm (i32 3585)))),
- (i32 1))),
- (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvmovImm (i32 2049)))),
- (i32 1))),
- (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvmovImm (i32 1)))),
- (i32 1))),
- (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvmovImm (i32 3585)))),
- (i32 1))),
- (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvmovImm (i32 2049)))),
- (i32 1))),
- (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvmovImm (i32 1)))),
- (i32 1))),
- (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
-
- def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
- def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvdup (i32 1)))),
- (i32 1))),
- (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
-}
-
-
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size, list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
@@ -2303,8 +2245,7 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size,
: MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, SDNode Op,
- SDPatternOperator unpred_op, Intrinsic PredInt, PatFrag add_op,
- SDNode shift_op> {
+ SDPatternOperator unpred_op, Intrinsic PredInt> {
def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>;
@@ -2313,26 +2254,18 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, SDNode Op,
// Unpredicated add-and-divide-by-two
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
- (Inst MQPR:$Qm, MQPR:$Qn)>;
}
}
-multiclass MVE_VHADD<MVEVectorVTInfo VTI, SDNode Op, PatFrag add_op, SDNode shift_op>
- : MVE_VHADD_m<VTI, Op, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
- shift_op>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI, SDNode Op>
+ : MVE_VHADD_m<VTI, Op, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
-// Halving add/sub perform the arithemtic operation with an extra bit of
-// precision, before performing the shift, to void clipping errors. We're not
-// modelling that here with these patterns, but we're using no wrap forms of
-// add/sub to ensure that the extra bit of information is not needed.
-defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, avgfloors, addnsw, ARMvshrsImm>;
-defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, avgfloors, addnsw, ARMvshrsImm>;
-defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, avgfloors, addnsw, ARMvshrsImm>;
-defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, avgflooru, addnuw, ARMvshruImm>;
-defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, avgflooru, addnuw, ARMvshruImm>;
-defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, avgflooru, addnuw, ARMvshruImm>;
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, avgfloors>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, avgfloors>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, avgfloors>;
+defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, avgflooru>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, avgflooru>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, avgflooru>;
multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
SDPatternOperator unpred_op, Intrinsic pred_int, PatFrag sub_op,
diff --git a/llvm/test/CodeGen/AArch64/avg.ll b/llvm/test/CodeGen/AArch64/avg.ll
index cabc0d346b806f..722d6560c9ed03 100644
--- a/llvm/test/CodeGen/AArch64/avg.ll
+++ b/llvm/test/CodeGen/AArch64/avg.ll
@@ -146,3 +146,244 @@ define <16 x i16> @sext_avgceils_mismatch(<16 x i4> %a0, <16 x i8> %a1) {
%avg = sub <16 x i16> %or, %shift
ret <16 x i16> %avg
}
+
+define <16 x i16> @add_avgflooru(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgflooru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uhadd v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+ %add = add nuw <16 x i16> %a0, %a1
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgflooru_mismatch(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgflooru_mismatch:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: add v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: ret
+ %add = add <16 x i16> %a0, %a1
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+ %add0 = add nuw <16 x i16> %a0, splat(i16 1)
+ %add = add nuw <16 x i16> %a1, %add0
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu2(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: urhadd v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: ret
+ %add0 = add nuw <16 x i16> %a1, %a0
+ %add = add nuw <16 x i16> %add0, splat(i16 1)
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu_mismatch1(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu_mismatch1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v4.8h, #1
+; CHECK-NEXT: add v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: uhadd v1.8h, v1.8h, v4.8h
+; CHECK-NEXT: ret
+ %add0 = add <16 x i16> %a1, %a0
+ %add = add nuw <16 x i16> %add0, splat(i16 1)
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu_mismatch2(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu_mismatch2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v3.16b, v3.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-NEXT: ret
+ %add0 = add nuw <16 x i16> %a1, %a0
+ %add = add <16 x i16> %add0, splat(i16 1)
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu_mismatch3(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu_mismatch3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v3.16b, v3.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-NEXT: ret
+ %add0 = add nuw <16 x i16> %a1, %a0
+ %add = add <16 x i16> %add0, splat(i16 1)
+ %avg = lshr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgfloors(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgfloors:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shadd v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+ %add = add nsw <16 x i16> %a0, %a1
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgfloors_mismatch(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgfloors_mismatch:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: add v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: sshr v1.8h, v1.8h, #1
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
+; CHECK-NEXT: ret
+ %add = add <16 x i16> %a0, %a1
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgfloors_mismatch2(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgfloors_mismatch2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: add v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: sshr v1.8h, v1.8h, #2
+; CHECK-NEXT: sshr v0.8h, v0.8h, #2
+; CHECK-NEXT: ret
+ %add = add nsw <16 x i16> %a0, %a1
+ %avg = ashr <16 x i16> %add, splat(i16 2)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: sub v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: sub v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: sshr v1.8h, v1.8h, #1
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
+; CHECK-NEXT: ret
+ %add0 = add nsw <16 x i16> %a0, splat(i16 1)
+ %add = add nsw <16 x i16> %a1, %add0
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils2(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: srhadd v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: srhadd v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: ret
+ %add0 = add nsw <16 x i16> %a1, %a0
+ %add = add nsw <16 x i16> %add0, splat(i16 1)
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils_mismatch1(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils_mismatch1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v4.8h, #1
+; CHECK-NEXT: add v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: shadd v1.8h, v1.8h, v4.8h
+; CHECK-NEXT: ret
+ %add0 = add <16 x i16> %a1, %a0
+ %add = add nsw <16 x i16> %add0, splat(i16 1)
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils_mismatch2(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils_mismatch2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v3.16b, v3.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
+; CHECK-NEXT: sshr v1.8h, v1.8h, #1
+; CHECK-NEXT: ret
+ %add0 = add nsw <16 x i16> %a1, %a0
+ %add = add <16 x i16> %add0, splat(i16 1)
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils_mismatch3(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils_mismatch3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v3.16b, v3.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
+; CHECK-NEXT: sshr v1.8h, v1.8h, #1
+; CHECK-NEXT: ret
+ %add0 = add nsw <16 x i16> %a1, %a0
+ %add = add <16 x i16> %add0, splat(i16 1)
+ %avg = ashr <16 x i16> %add, splat(i16 1)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceils_mismatch4(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceils_mismatch4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: sub v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: sub v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: sshr v1.8h, v1.8h, #2
+; CHECK-NEXT: sshr v0.8h, v0.8h, #2
+; CHECK-NEXT: ret
+ %add0 = add nsw <16 x i16> %a0, splat(i16 1)
+ %add = add nsw <16 x i16> %a1, %add0
+ %avg = ashr <16 x i16> %add, splat(i16 2)
+ ret <16 x i16> %avg
+}
+
+define <16 x i16> @add_avgceilu_mismatch(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: add_avgceilu_mismatch:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v4.8h, #1
+; CHECK-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: add v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: add v1.8h, v1.8h, v4.8h
+; CHECK-NEXT: add v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #2
+; CHECK-NEXT: ushr v1.8h, v1.8h, #2
+; CHECK-NEXT: ret
+ %add0 = add nuw <16 x i16> %a1, %a0
+ %add = add nuw <16 x i16> %add0, splat(i16 1)
+ %avg = lshr <16 x i16> %add, splat(i16 2)
+ ret <16 x i16> %avg
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll
index 6017e13ce00352..ce440d3095d3f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-hadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll
@@ -1301,3 +1301,43 @@ entry:
%result = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %result
}
+
+define <vscale x 2 x i64> @haddu_v2i64_add(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; SVE-LABEL: haddu_v2i64_add:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: eor z2.d, z0.d, z1.d
+; SVE-NEXT: and z0.d, z0.d, z1.d
+; SVE-NEXT: lsr z1.d, z2.d, #1
+; SVE-NEXT: add z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: haddu_v2i64_add:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: uhadd z0.d, p0/m, z0.d, z1.d
+; SVE2-NEXT: ret
+entry:
+ %add = add nuw nsw <vscale x 2 x i64> %s0, %s1
+ %avg = lshr <vscale x 2 x i64> %add, splat (i64 1)
+ ret <vscale x 2 x i64> %avg
+}
+
+define <vscale x 2 x i64> @hadds_v2i64_add(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; SVE-LABEL: hadds_v2i64_add:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: eor z2.d, z0.d, z1.d
+; SVE-NEXT: and z0.d, z0.d, z1.d
+; SVE-NEXT: asr z1.d, z2.d, #1
+; SVE-NEXT: add z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: hadds_v2i64_add:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: shadd z0.d, p0/m, z0.d, z1.d
+; SVE2-NEXT: ret
+entry:
+ %add = add nuw nsw <vscale x 2 x i64> %s0, %s1
+ %avg = ashr <vscale x 2 x i64> %add, splat (i64 1)
+ ret <vscale x 2 x i64> %avg
+}