[llvm] [DAG][RISCV] Use vp_reduce_fadd/fmul when widening types for FP reductions (PR #105840)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 23 08:12:03 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
Author: Philip Reames (preames)
This is a follow-up to #105455: it updates the VPIntrinsic mappings for the fadd and fmul cases and supports both ordered and unordered reductions. This allows the use of a single wider operation with a restricted EVL instead of padding the vector with the neutral element.
This has all the same tradeoffs as the previous patch.
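For illustration, the `vreduce_ord_fadd_nxv6f16` test below (taken verbatim from the updated `vreductions-fp-sdnode.ll`) shows the kind of reduction affected when an illegal type such as `<vscale x 6 x half>` is widened to `<vscale x 8 x half>`:

```llvm
declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)

; Ordered (non-reassociative) FP reduction over an illegal vector type.
; The type legalizer widens <vscale x 6 x half> to <vscale x 8 x half>.
define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
  %red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v)
  ret half %red
}
```

Before this patch, the extra lanes of the widened vector were filled with the neutral element (-0.0, materialized via `vmv.v.x`/`vslideup.vx`) before a full-length `vfredosum.vs`. With the VP mapping in place, the reduction is emitted directly with VL restricted to the original element count, as the updated CHECK lines in the diff show.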
---
Full diff: https://github.com/llvm/llvm-project/pull/105840.diff
5 Files Affected:
- (modified) llvm/include/llvm/IR/VPIntrinsics.def (+7-5)
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+16-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll (+2-7)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll (-12)
- (modified) llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll (+42-58)
``````````diff
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 9333f6be5b516d..521cbc2dc278f9 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -722,13 +722,15 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM,
#error \
"The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!"
#endif
-#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, INTRIN) \
+#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, SDOPC, SEQ_SDOPC, INTRIN) \
BEGIN_REGISTER_VP_INTRINSIC(VPID, 2, 3) \
BEGIN_REGISTER_VP_SDNODE(VPSD, 1, VPID, 2, 3) \
VP_PROPERTY_REDUCTION(0, 1) \
+ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \
END_REGISTER_VP_SDNODE(VPSD) \
BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, 1, VPID, 2, 3) \
HELPER_MAP_VPID_TO_VPSD(VPID, SEQ_VPSD) \
+ VP_PROPERTY_FUNCTIONAL_SDOPC(SEQ_SDOPC) \
VP_PROPERTY_REDUCTION(0, 1) \
END_REGISTER_VP_SDNODE(SEQ_VPSD) \
VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \
@@ -736,13 +738,13 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM,
// llvm.vp.reduce.fadd(start,x,mask,vlen)
HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fadd, VP_REDUCE_FADD,
- VP_REDUCE_SEQ_FADD,
- vector_reduce_fadd)
+ VP_REDUCE_SEQ_FADD, VECREDUCE_FADD,
+ VECREDUCE_SEQ_FADD, vector_reduce_fadd)
// llvm.vp.reduce.fmul(start,x,mask,vlen)
HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fmul, VP_REDUCE_FMUL,
- VP_REDUCE_SEQ_FMUL,
- vector_reduce_fmul)
+ VP_REDUCE_SEQ_FMUL, VECREDUCE_FMUL,
+ VECREDUCE_SEQ_FMUL, vector_reduce_fmul)
#undef HELPER_REGISTER_REDUCTION_SEQ_VP
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5745c147e3502d..475d5806467d98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7311,8 +7311,6 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
// Generate a vp.reduce_op if it is custom/legal for the target. This avoids
// needing to pad the source vector, because the inactive lanes can simply be
// disabled and not contribute to the result.
- // TODO: VECREDUCE_FADD, VECREDUCE_FMUL aren't currently mapped correctly,
- // and thus don't take this path.
if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc);
VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) {
SDValue Start = NeutralElem;
@@ -7351,6 +7349,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) {
SDValue VecOp = N->getOperand(1);
SDValue Op = GetWidenedVector(VecOp);
+ EVT VT = N->getValueType(0);
EVT OrigVT = VecOp.getValueType();
EVT WideVT = Op.getValueType();
EVT ElemVT = OrigVT.getVectorElementType();
@@ -7364,6 +7363,19 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) {
unsigned OrigElts = OrigVT.getVectorMinNumElements();
unsigned WideElts = WideVT.getVectorMinNumElements();
+ // Generate a vp.reduce_op if it is custom/legal for the target. This avoids
+ // needing to pad the source vector, because the inactive lanes can simply be
+ // disabled and not contribute to the result.
+ if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc);
+ VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) {
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideVT.getVectorElementCount());
+ SDValue Mask = DAG.getAllOnesConstant(dl, WideMaskVT);
+ SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(),
+ OrigVT.getVectorElementCount());
+ return DAG.getNode(*VPOpcode, dl, VT, {AccOp, Op, Mask, EVL}, Flags);
+ }
+
if (WideVT.isScalableVector()) {
unsigned GCD = std::gcd(OrigElts, WideElts);
EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
@@ -7372,14 +7384,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) {
for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD)
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral,
DAG.getVectorIdxConstant(Idx, dl));
- return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+ return DAG.getNode(Opc, dl, VT, AccOp, Op, Flags);
}
for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
DAG.getVectorIdxConstant(Idx, dl));
- return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+ return DAG.getNode(Opc, dl, VT, AccOp, Op, Flags);
}
SDValue DAGTypeLegalizer::WidenVecOp_VP_REDUCE(SDNode *N) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index fa56412e71c678..6e5ab436fc02d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -791,12 +791,7 @@ define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 6
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 7
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -880,7 +875,7 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) {
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vslideup.vi v8, v9, 3
+; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT: vfredusum.vs v8, v8, v9
; CHECK-NEXT: vfmv.f.s fa4, v8
; CHECK-NEXT: fadd.s fa0, fa4, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 26dc11aef2805b..566c9070eab512 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -98,10 +98,6 @@ define half @vreduce_fadd_v7f16(ptr %x, half %s) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 7, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 7
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vfredusum.vs v8, v8, v9
; CHECK-NEXT: vfmv.f.s fa0, v8
@@ -470,10 +466,6 @@ define float @vreduce_fadd_v7f32(ptr %x, float %s) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: lui a0, 524288
-; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
@@ -488,10 +480,6 @@ define float @vreduce_ord_fadd_v7f32(ptr %x, float %s) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: lui a0, 524288
-; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vfredosum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 5b140299070b94..c2ad7e76a26c75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -889,17 +889,12 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
-; CHECK-NEXT: vfredosum.vs v8, v8, v9
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vfredosum.vs v9, v8, v9
+; CHECK-NEXT: vfmv.f.s fa0, v9
; CHECK-NEXT: ret
%red = call half @llvm.vector.reduce.fadd.nxv3f16(half %s, <vscale x 3 x half> %v)
ret half %red
@@ -910,18 +905,15 @@ declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
-; CHECK-NEXT: vfredosum.vs v8, v8, v10
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vfredosum.vs v10, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v10
; CHECK-NEXT: ret
%red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v)
ret half %red
@@ -932,22 +924,15 @@ declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
; CHECK-LABEL: vreduce_ord_fadd_nxv10f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v10, v12, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT: vmv.v.v v11, v12
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v11, v12, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: li a1, 10
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
-; CHECK-NEXT: vfredosum.vs v8, v8, v12
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfredosum.vs v12, v8, v12
+; CHECK-NEXT: vfmv.f.s fa0, v12
; CHECK-NEXT: ret
%red = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, <vscale x 10 x half> %v)
ret half %red
@@ -958,13 +943,16 @@ declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) {
; CHECK-LABEL: vreduce_ord_fadd_nxv12f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v11, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfredosum.vs v8, v8, v12
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfredosum.vs v12, v8, v12
+; CHECK-NEXT: vfmv.f.s fa0, v12
; CHECK-NEXT: ret
%red = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, <vscale x 12 x half> %v)
ret half %red
@@ -977,17 +965,14 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
-; CHECK-NEXT: vfredusum.vs v8, v8, v9
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vfredusum.vs v10, v8, v9
+; CHECK-NEXT: vfmv.f.s fa0, v10
; CHECK-NEXT: ret
%red = call reassoc half @llvm.vector.reduce.fadd.nxv3f16(half %s, <vscale x 3 x half> %v)
ret half %red
@@ -996,18 +981,17 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv6f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
-; CHECK-NEXT: vfredusum.vs v8, v8, v10
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vmv.s.x v11, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vfredusum.vs v11, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v11
; CHECK-NEXT: ret
%red = call reassoc half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v)
ret half %red
``````````
https://github.com/llvm/llvm-project/pull/105840