[llvm] [AArch64][SVE] Improve code quality of vector unsigned add reduction. (PR #97339)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 1 12:33:36 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Dinar Temirbulatov (dtemirbulatov)
Changes:
For SVE we don't have to zero-extend and sum parts of the result before issuing the UADDV instruction. This change also handles types wider than a legal vector type more efficiently and lowers fixed-length vector types to SVE's UADDV where appropriate.
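As a minimal illustration (taken directly from the new sve-int-reduce.ll test below), the combine matches a `zext` feeding `llvm.vector.reduce.add` on a legal SVE type:

```llvm
; Unsigned add reduction of a zero-extended legal SVE vector.
define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
  %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
  ret i32 %2
}
```

With this patch the function above compiles to `ptrue p0.b; uaddv d0, p0, z0.b; fmov x0, d0` instead of a chain of `uunpklo`/`uunpkhi` widening and vector adds; inputs wider than a legal type are split and each part is reduced with its own UADDV, with the scalar results added together.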
---
Full diff: https://github.com/llvm/llvm-project/pull/97339.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+98-2)
- (modified) llvm/test/CodeGen/AArch64/sve-doublereduct.ll (+15-26)
- (modified) llvm/test/CodeGen/AArch64/sve-int-reduce.ll (+97)
- (added) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll (+135)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 130d1e9683f7e..74c11a32a3c73 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17455,6 +17455,99 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
}
+static SDValue
+performVecReduceAddZextCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64TargetLowering &TLI) {
+ if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ SDNode *ZEXT = N->getOperand(0).getNode();
+ EVT VecVT = ZEXT->getOperand(0).getValueType();
+ SDLoc DL(N);
+
+ SDValue VecOp = ZEXT->getOperand(0);
+ VecVT = VecOp.getValueType();
+ bool IsScalableType = VecVT.isScalableVector();
+
+ if (TLI.isTypeLegal(VecVT)) {
+ if (!IsScalableType &&
+ !TLI.useSVEForFixedLengthVectorVT(
+ VecVT,
+ /*OverrideNEON=*/Subtarget.useSVEForFixedLengthVectors(VecVT)))
+ return SDValue();
+
+ if (!IsScalableType) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+ VecVT = VecOp.getValueType();
+ EVT RdxVT = N->getValueType(0);
+ RdxVT = getPackedSVEVectorVT(RdxVT);
+ SDValue Pg = getPredicateForVector(DAG, DL, VecVT);
+ SDValue Res = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, VecOp);
+ EVT ResVT = MVT::i64;
+ if (ResVT != N->getValueType(0))
+ Res = DAG.getAnyExtOrTrunc(Res, DL, N->getValueType(0));
+ return Res;
+ }
+
+ SmallVector<SDValue, 4> SplitVals;
+ SmallVector<SDValue, 4> PrevVals;
+ PrevVals.push_back(VecOp);
+ while (true) {
+
+ if (!VecVT.isScalableVector() &&
+ !PrevVals[0].getValueType().getVectorElementCount().isKnownEven())
+ return SDValue();
+
+ for (SDValue Vec : PrevVals) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+ SplitVals.push_back(Lo);
+ SplitVals.push_back(Hi);
+ }
+ if (TLI.isTypeLegal(SplitVals[0].getValueType()))
+ break;
+ PrevVals.clear();
+ std::copy(SplitVals.begin(), SplitVals.end(), std::back_inserter(PrevVals));
+ SplitVals.clear();
+ }
+ SDNode *VecRed = N;
+ EVT ElemType = VecRed->getValueType(0);
+ SmallVector<SDValue, 4> Results;
+
+ if (!IsScalableType &&
+ !TLI.useSVEForFixedLengthVectorVT(
+ SplitVals[0].getValueType(),
+ /*OverrideNEON=*/Subtarget.useSVEForFixedLengthVectors(
+ SplitVals[0].getValueType())))
+ return SDValue();
+
+ for (unsigned Num = 0; Num < SplitVals.size(); ++Num) {
+ SDValue Reg = SplitVals[Num];
+ EVT RdxVT = Reg->getValueType(0);
+ SDValue Pg = getPredicateForVector(DAG, DL, RdxVT);
+ if (!IsScalableType) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, RdxVT);
+ Reg = convertToScalableVector(DAG, ContainerVT, Reg);
+ }
+ SDValue Res = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, Reg);
+ if (ElemType != MVT::i64)
+ Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType);
+ Results.push_back(Res);
+ }
+ SDValue ToAdd = Results[0];
+ for (unsigned I = 1; I < SplitVals.size(); ++I)
+ ToAdd = DAG.getNode(ISD::ADD, DL, ElemType, ToAdd, Results[I]);
+ return ToAdd;
+}
+
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
@@ -25128,8 +25221,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI, Subtarget);
- case ISD::VECREDUCE_ADD:
- return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+ case ISD::VECREDUCE_ADD: {
+ if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget))
+ return Val;
+ return performVecReduceAddZextCombine(N, DCI, *this);
+ }
case AArch64ISD::UADDV:
return performUADDVCombine(N, DAG);
case AArch64ISD::SMULL:
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb654..b289dfbec527c 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -103,17 +103,12 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z3.h, z1.b
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: add z0.h, z0.h, z2.h
-; CHECK-NEXT: add z1.h, z1.h, z3.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: uaddv d1, p0, z1.b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -126,21 +121,15 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpklo z3.h, z1.b
-; CHECK-NEXT: uunpklo z4.h, z0.b
-; CHECK-NEXT: uunpkhi z1.h, z1.b
-; CHECK-NEXT: uunpkhi z0.h, z0.b
-; CHECK-NEXT: uunpkhi z5.h, z2.b
-; CHECK-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: add z1.h, z4.h, z3.h
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: add z1.h, z2.h, z5.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d1, p0, z1.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: uaddv d2, p0, z2.b
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f25..4b401731e6f80 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -188,6 +188,103 @@ define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}
+define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+ ret i32 %2
+}
+
+define i64 @uaddv_nxv16i16_nxv16i64(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 16 x i16> %a to <vscale x 16 x i64>
+ %2 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> %1)
+ ret i64 %2
+}
+
+define i32 @uaddv_nxv16i16_nxv16i32(<vscale x 32 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d2, p0, z2.h
+; CHECK-NEXT: uaddv d3, p0, z3.h
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @saddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: saddv_nxv16i8_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z1.h, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: sunpkhi z0.s, z0.h
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z1.s, z3.s, z2.s
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) {
+; CHECK-LABEL: uaddv_nxv32i16_nxv32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d2, p0, z2.h
+; CHECK-NEXT: uaddv d3, p0, z3.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %1 = load <vscale x 32 x i16>, ptr %a, align 16
+ %2 = zext <vscale x 32 x i16> %1 to <vscale x 32 x i32>
+ %3 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %2)
+ ret i32 %3
+}
+
; UMINV
define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
new file mode 100644
index 0000000000000..9f076bd3c8e40
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,NO_STREAMING
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE_128
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE_MIN_256
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @reduce_uadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_uadd_v16i8:
+; NO_STREAMING: // %bb.0:
+; NO_STREAMING-NEXT: ushll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT: ushll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT: ushll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT: ushll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
+; NO_STREAMING-NEXT: uaddl v2.4s, v3.4h, v2.4h
+; NO_STREAMING-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
+; NO_STREAMING-NEXT: uaddl v0.4s, v0.4h, v1.4h
+; NO_STREAMING-NEXT: add v1.4s, v5.4s, v4.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT: addv s0, v0.4s
+; NO_STREAMING-NEXT: fmov w0, s0
+; NO_STREAMING-NEXT: ret
+;
+; SVE_128-LABEL: reduce_uadd_v16i8:
+; SVE_128: // %bb.0:
+; SVE_128-NEXT: ptrue p0.b
+; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT: uaddv d1, p0, z1.b
+; SVE_128-NEXT: uaddv d0, p0, z0.b
+; SVE_128-NEXT: fmov x8, d1
+; SVE_128-NEXT: fmov x9, d0
+; SVE_128-NEXT: add w0, w9, w8
+; SVE_128-NEXT: ret
+;
+; SVE_MIN_256-LABEL: reduce_uadd_v16i8:
+; SVE_MIN_256: // %bb.0:
+; SVE_MIN_256-NEXT: ptrue p0.b, vl16
+; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b
+; SVE_MIN_256-NEXT: ptrue p0.b
+; SVE_MIN_256-NEXT: uaddv d0, p0, z0.b
+; SVE_MIN_256-NEXT: fmov x0, d0
+; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT: ret
+ %1 = zext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @reduce_sadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_sadd_v16i8:
+; NO_STREAMING: // %bb.0:
+; NO_STREAMING-NEXT: sshll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT: sshll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT: sshll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT: sshll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT: saddl2 v4.4s, v3.8h, v2.8h
+; NO_STREAMING-NEXT: saddl v2.4s, v3.4h, v2.4h
+; NO_STREAMING-NEXT: saddl2 v5.4s, v0.8h, v1.8h
+; NO_STREAMING-NEXT: saddl v0.4s, v0.4h, v1.4h
+; NO_STREAMING-NEXT: add v1.4s, v5.4s, v4.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT: addv s0, v0.4s
+; NO_STREAMING-NEXT: fmov w0, s0
+; NO_STREAMING-NEXT: ret
+;
+; SVE_128-LABEL: reduce_sadd_v16i8:
+; SVE_128: // %bb.0:
+; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT: sunpklo z2.h, z1.b
+; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT: sunpklo z3.h, z0.b
+; SVE_128-NEXT: ptrue p0.s
+; SVE_128-NEXT: ext z1.b, z1.b, z1.b, #8
+; SVE_128-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_128-NEXT: sunpklo z1.h, z1.b
+; SVE_128-NEXT: sunpklo z0.h, z0.b
+; SVE_128-NEXT: sunpklo z4.s, z2.h
+; SVE_128-NEXT: ext z2.b, z2.b, z2.b, #8
+; SVE_128-NEXT: sunpklo z6.s, z3.h
+; SVE_128-NEXT: ext z3.b, z3.b, z3.b, #8
+; SVE_128-NEXT: mov z5.d, z1.d
+; SVE_128-NEXT: sunpklo z7.s, z0.h
+; SVE_128-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_128-NEXT: sunpklo z2.s, z2.h
+; SVE_128-NEXT: sunpklo z3.s, z3.h
+; SVE_128-NEXT: add z4.s, z6.s, z4.s
+; SVE_128-NEXT: ext z5.b, z5.b, z1.b, #8
+; SVE_128-NEXT: sunpklo z1.s, z1.h
+; SVE_128-NEXT: sunpklo z0.s, z0.h
+; SVE_128-NEXT: add z2.s, z3.s, z2.s
+; SVE_128-NEXT: sunpklo z5.s, z5.h
+; SVE_128-NEXT: add z1.s, z7.s, z1.s
+; SVE_128-NEXT: add z0.s, z0.s, z5.s
+; SVE_128-NEXT: add z1.s, z4.s, z1.s
+; SVE_128-NEXT: add z0.s, z2.s, z0.s
+; SVE_128-NEXT: add z0.s, z1.s, z0.s
+; SVE_128-NEXT: uaddv d0, p0, z0.s
+; SVE_128-NEXT: fmov x0, d0
+; SVE_128-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_128-NEXT: ret
+;
+; SVE_MIN_256-LABEL: reduce_sadd_v16i8:
+; SVE_MIN_256: // %bb.0:
+; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT: sunpklo z2.h, z1.b
+; SVE_MIN_256-NEXT: sunpklo z3.h, z0.b
+; SVE_MIN_256-NEXT: ptrue p0.s, vl8
+; SVE_MIN_256-NEXT: ext z1.b, z1.b, z1.b, #8
+; SVE_MIN_256-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_MIN_256-NEXT: sunpklo z1.h, z1.b
+; SVE_MIN_256-NEXT: sunpklo z0.h, z0.b
+; SVE_MIN_256-NEXT: add z2.h, z3.h, z2.h
+; SVE_MIN_256-NEXT: add z0.h, z0.h, z1.h
+; SVE_MIN_256-NEXT: sunpklo z1.s, z2.h
+; SVE_MIN_256-NEXT: sunpklo z0.s, z0.h
+; SVE_MIN_256-NEXT: add z0.s, z1.s, z0.s
+; SVE_MIN_256-NEXT: uaddv d0, p0, z0.s
+; SVE_MIN_256-NEXT: fmov x0, d0
+; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT: ret
+ %1 = sext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1)
+ ret i32 %2
+}
+
+attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
``````````
https://github.com/llvm/llvm-project/pull/97339