[llvm] 6222c8f - [IR][LangRef] Add partial reduction add intrinsic (#94499)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 4 05:32:46 PDT 2024
Author: Nicholas Guy
Date: 2024-07-04T13:32:42+01:00
New Revision: 6222c8f0305de1fdc5ff39f5f1d87fcfeebfa646
URL: https://github.com/llvm/llvm-project/commit/6222c8f0305de1fdc5ff39f5f1d87fcfeebfa646
DIFF: https://github.com/llvm/llvm-project/commit/6222c8f0305de1fdc5ff39f5f1d87fcfeebfa646.diff
LOG: [IR][LangRef] Add partial reduction add intrinsic (#94499)
Adds the llvm.experimental.vector.partial.reduce.add.* overloaded intrinsic;
this intrinsic represents add reductions that result in a narrower
vector.
Added:
llvm/test/CodeGen/AArch64/partial-reduction-add.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/IR/Verifier.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c98332d3a24fc..b9f02d6b4b41e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19441,6 +19441,37 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics reduce the
+concatenation of the two vector operands down to the number of elements dictated
+by the result type. The result type is a vector type that matches the type of the
+first operand vector.
+
+Arguments:
+""""""""""
+
+Both arguments must be vectors of matching element types. The first argument type must
+match the result type, while the second argument type must have a vector length that is a
+positive integer multiple of the vector length of the first argument/result type. The
+arguments must be either both fixed or both scalable vectors.
+
+
'``llvm.experimental.vector.histogram.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c7d383a5d0c0c..95dbd2854322d 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2640,6 +2640,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+ [llvm_anyvector_ty, llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 8db2708d41a69..cc55d53597b65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -104,6 +104,7 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cstddef>
+#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -7976,6 +7977,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ SDValue OpNode = getValue(I.getOperand(1));
+ EVT ReducedTy = EVT::getEVT(I.getType());
+ EVT FullTy = OpNode.getValueType();
+
+ unsigned Stride = ReducedTy.getVectorMinNumElements();
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+
+ // Collect all of the subvectors
+ std::deque<SDValue> Subvectors;
+ Subvectors.push_back(getValue(I.getOperand(0)));
+ for (unsigned i = 0; i < ScaleFactor; i++) {
+ auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
+ Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
+ {OpNode, SourceIndex}));
+ }
+
+ // Flatten the subvector tree
+ while (Subvectors.size() > 1) {
+ Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
+ {Subvectors[0], Subvectors[1]}));
+ Subvectors.pop_front();
+ Subvectors.pop_front();
+ }
+
+ assert(Subvectors.size() == 1 &&
+ "There should only be one subvector after tree flattening");
+
+ setValue(&I, Subvectors[0]);
+ return;
+ }
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c98f61d555140..44982f55e17de 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6143,6 +6143,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
break;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+ VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+ unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
+ unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+ Check((VecWidth % AccWidth) == 0,
+ "Invalid vector widths for partial "
+ "reduction. The width of the input vector "
+ "must be a positive integer multiple of "
+ "the width of the accumulator vector.");
+ break;
+ }
case Intrinsic::experimental_noalias_scope_decl: {
NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
break;
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
new file mode 100644
index 0000000000000..ae681ee54e687
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z2.s, z2.s, z3.s
+; CHECK-NEXT: add z0.s, z4.s, z0.s
+; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z2.s
+; CHECK-NEXT: add z1.s, z1.s, z3.s
+; CHECK-NEXT: add z0.s, z4.s, z0.s
+; CHECK-NEXT: add z1.s, z5.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 8 x i32> %partial.reduce
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+
+attributes #0 = { "target-features"="+sve2" }
More information about the llvm-commits
mailing list