[llvm] [IR][LangRef] Add partial reduction add intrinsic (PR #94499)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 10 07:59:09 PDT 2024
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/94499
>From f365ac737c27d3c9e5978d3c64787749d89355a7 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 16:43:04 +0100
Subject: [PATCH 1/2] Add partial reduction add intrinsic
---
llvm/docs/LangRef.rst | 33 +++++++-
llvm/include/llvm/IR/Intrinsics.td | 6 ++
.../SelectionDAG/SelectionDAGBuilder.cpp | 21 +++++
.../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 76 +++++++++++++++++++
4 files changed, 134 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9d7ade8eb523b..95f839e35b673 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14250,7 +14250,7 @@ Arguments:
""""""""""
The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing
is specific to callsites, meaning callsites are indexed from 0, independent from
-the indexes used by the other intrinsics (such as
+the indexes used by the other intrinsics (such as
``llvm.instrprof.increment[.step]``).
The last argument is the called value of the callsite this intrinsic precedes.
@@ -14264,7 +14264,7 @@ a buffer LLVM can use to perform counter increments (i.e. the lowering of
``llvm.instrprof.increment[.step]``. The address range following the counter
buffer, ``<num-counters>`` x ``sizeof(ptr)`` - sized, is expected to contain
pointers to contexts of functions called from this function ("subcontexts").
-LLVM does not dereference into that memory region, just calculates GEPs.
+LLVM does not dereference into that memory region, just calculates GEPs.
The lowering of ``llvm.instrprof.callsite`` consists of:
@@ -19209,6 +19209,35 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 8 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics perform an
+integer ``ADD`` reduction of subvectors within a vector, returning each scalar
+result as a lane within a vector. The return type is a vector type with the
+same element type as the input vector and an element count that is a factor of
+the input vector's element count (typically either half or a quarter).
+
+Arguments:
+""""""""""
+
+The argument to this intrinsic must be a vector of integer values.
+
+
'``llvm.experimental.vector.histogram.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
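
As a rough illustration (not part of the patch), the semantics described in the
new LangRef text can be modelled in scalar C++ as below. The helper name
partialReduceAdd is made up for this sketch, and it assumes each result lane
sums one contiguous group of input lanes; that grouping is an assumption made
here for clarity rather than something the wording above pins down.

    // Scalar reference model: reduce In (width = Group * ResultLanes) into
    // ResultLanes lanes by summing contiguous groups of input elements.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    static std::vector<int32_t> partialReduceAdd(const std::vector<int32_t> &In,
                                                 size_t ResultLanes) {
      assert(ResultLanes && In.size() % ResultLanes == 0 &&
             "input width must be a multiple of the result width");
      const size_t Group = In.size() / ResultLanes;
      std::vector<int32_t> Out(ResultLanes, 0);
      for (size_t Lane = 0; Lane < ResultLanes; ++Lane)
        for (size_t I = 0; I < Group; ++I)
          Out[Lane] += In[Lane * Group + I];
      return Out;
    }

    int main() {
      // Mirrors the <8 x i32> -> <4 x i32> overload: adjacent pairs are summed.
      std::vector<int32_t> In{1, 2, 3, 4, 5, 6, 7, 8};
      std::vector<int32_t> Out = partialReduceAdd(In, 4); // {3, 7, 11, 15}
      assert(Out.size() == 4 && Out[0] == 3 && Out[3] == 15);
      return 0;
    }
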
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 107442623ab7b..08c516bd1cea1 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba76456b5836a..f24723a45237d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7914,6 +7914,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ auto DL = getCurSDLoc();
+ auto ReducedTy = EVT::getEVT(I.getType());
+ auto OpNode = getValue(I.getOperand(0));
+ auto Index = DAG.getVectorIdxConstant(0, DL);
+ auto FullTy = OpNode.getValueType();
+
+ auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
+
+ for(unsigned i = 0; i < ScaleFactor; i++) {
+ auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
+ auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+ auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
+ N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
+ ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+ }
+
+ setValue(&I, ResultVector);
+ return;
+ }
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
new file mode 100644
index 0000000000000..6a5b3bd5ace2e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @partial_reduce_add(<vscale x 16 x i8> %wide.load.pre, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %index) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: index z2.s, #0, #1
+; CHECK-NEXT: mov z4.s, w8
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: ptrue p2.s, vl1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: mov z6.s, w8
+; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z4.s
+; CHECK-NEXT: uaddv d3, p0, z0.s
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: uaddv d7, p0, z1.s
+; CHECK-NEXT: uaddv d4, p0, z5.s
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: mov z1.s, p2/m, w8
+; CHECK-NEXT: mov w8, #3 // =0x3
+; CHECK-NEXT: cmpeq p2.s, p0/z, z2.s, z6.s
+; CHECK-NEXT: mov z5.s, w8
+; CHECK-NEXT: fmov x8, d7
+; CHECK-NEXT: uaddv d3, p0, z3.s
+; CHECK-NEXT: mov z1.s, p1/m, w8
+; CHECK-NEXT: fmov x8, d4
+; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z5.s
+; CHECK-NEXT: mov z1.s, p2/m, w8
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: mov z1.s, p0/m, w8
+; CHECK-NEXT: addvl x8, x1, #1
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: cbnz x8, .LBB0_1
+; CHECK-NEXT: // %bb.2: // %middle.block
+; CHECK-NEXT: ret
+entry:
+ %2 = call i64 @llvm.vscale.i64()
+ %3 = mul i64 %2, 16
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %4, %vector.body ]
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %1)
+ %4 = or <vscale x 4 x i32> %partial.reduce, %vec.phi
+ %index.next = add i64 %index, %3
+ %5 = icmp eq i64 %index.next, 0
+ br i1 %5, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %6 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %4)
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i64 @llvm.vscale.i64() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
>From 102f9e40ddb33c2b4edaa0d1604d9d3dbe582928 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Mon, 10 Jun 2024 15:44:54 +0100
Subject: [PATCH 2/2] Change partial reduction intrinsic to take the
accumulator as an operand
---
llvm/docs/LangRef.rst | 22 ++-
llvm/include/llvm/IR/Intrinsics.td | 4 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 11 +-
llvm/lib/IR/Verifier.cpp | 13 ++
.../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 176 +++++++++++++-----
5 files changed, 164 insertions(+), 62 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 95f839e35b673..640b5062090f2 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19218,24 +19218,26 @@ This is an overloaded intrinsic.
::
- declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<8 x i32> %in)
- declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
- declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 8 x i32> %in)
- declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accum, <8 x i32> %in)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accum, <vscale x 8 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accum, <vscale x 16 x i32> %in)
Overview:
"""""""""
-The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics perform an
-integer ``ADD`` reduction of subvectors within a vector, returning each scalar
-result as a lane within a vector. The return type is a vector type with the
-same element type as the input vector and an element count that is a factor of
-the input vector's element count (typically either half or a quarter).
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics perform an
+integer ``ADD`` reduction of subvectors within a vector, then add the resulting
+vector to the provided accumulator vector. The return type is a vector type
+that matches the type of the accumulator vector.
Arguments:
""""""""""
-The argument to this intrinsic must be a vector of integer values.
+The first argument is the accumulator vector, which may be ``zeroinitializer``.
+The type of this argument must match the return type. The second argument is
+the vector to reduce into the accumulator; the width of this vector must be a
+positive integer multiple of the width of the accumulator vector (and thus of
+the return type).
'``llvm.experimental.vector.histogram.*``' Intrinsic
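
Again only as an illustration (not part of the patch), the accumulator form
described above corresponds roughly to the following scalar C++ sketch. The
helper name partialReduceAdd is invented here, and the same contiguous-grouping
assumption as in the earlier sketch applies.

    // Scalar reference model of the accumulator form: each lane of Acc receives
    // the sum of one contiguous group of input elements added to its old value.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    static std::vector<int32_t>
    partialReduceAdd(std::vector<int32_t> Acc, const std::vector<int32_t> &In) {
      assert(!Acc.empty() && In.size() % Acc.size() == 0 &&
             "input width must be a positive multiple of the accumulator width");
      const size_t Group = In.size() / Acc.size();
      for (size_t Lane = 0; Lane < Acc.size(); ++Lane)
        for (size_t I = 0; I < Group; ++I)
          Acc[Lane] += In[Lane * Group + I];
      return Acc;
    }

    int main() {
      // Mirrors the <4 x i32> accumulator / <8 x i32> input overload.
      std::vector<int32_t> Acc{10, 20, 30, 40};
      std::vector<int32_t> In{1, 2, 3, 4, 5, 6, 7, 8};
      std::vector<int32_t> Res = partialReduceAdd(Acc, In); // {13, 27, 41, 55}
      assert(Res[0] == 13 && Res[3] == 55);
      return 0;
    }
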
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 08c516bd1cea1..5b3e3d2387463 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2637,8 +2637,8 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
//===-------------- Intrinsics to perform partial reduction ---------------===//
-def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [llvm_anyvector_ty],
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+ [llvm_anyvector_ty, llvm_anyvector_ty],
[IntrNoMem]>;
//===----------------- Pointer Authentication Intrinsics ------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f24723a45237d..b9f7aa80b48cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7917,22 +7917,23 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_vector_partial_reduce_add: {
auto DL = getCurSDLoc();
auto ReducedTy = EVT::getEVT(I.getType());
- auto OpNode = getValue(I.getOperand(0));
- auto Index = DAG.getVectorIdxConstant(0, DL);
+ auto OpNode = getValue(I.getOperand(1));
auto FullTy = OpNode.getValueType();
- auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
+ auto Accumulator = getValue(I.getOperand(0));
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
for(unsigned i = 0; i < ScaleFactor; i++) {
auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+ auto ExistingValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ReducedTy.getScalarType(), {Accumulator, TargetIndex});
auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
- ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+ N = DAG.getNode(ISD::ADD, DL, ReducedTy.getScalarType(), ExistingValue, N);
+ Accumulator = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {Accumulator, N, TargetIndex});
}
- setValue(&I, ResultVector);
+ setValue(&I, Accumulator);
return;
}
case Intrinsic::experimental_cttz_elts: {
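
To make the shape of the generic expansion above easier to follow, here is a
rough scalar analogue (again not part of the patch) in which each step is
labelled with the DAG node the lowering emits per result lane. The helper name
expandPartialReduceAdd is invented, and contiguous subvector placement is an
assumption of this sketch.

    // Rough scalar analogue of the expansion loop; comments name the DAG nodes.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    static void expandPartialReduceAdd(std::vector<int32_t> &Accumulator,
                                       const std::vector<int32_t> &In) {
      assert(!Accumulator.empty() && In.size() % Accumulator.size() == 0);
      const size_t ResultLanes = Accumulator.size();
      const size_t Group = In.size() / ResultLanes;
      for (size_t Lane = 0; Lane < ResultLanes; ++Lane) {
        // EXTRACT_VECTOR_ELT: read the existing accumulator lane.
        int32_t Existing = Accumulator[Lane];
        // EXTRACT_SUBVECTOR + VECREDUCE_ADD: sum one subvector of the input.
        int32_t Sum = 0;
        for (size_t I = 0; I < Group; ++I)
          Sum += In[Lane * Group + I];
        // ADD + INSERT_VECTOR_ELT: fold the sum back into the accumulator lane.
        Accumulator[Lane] = Existing + Sum;
      }
    }

    int main() {
      std::vector<int32_t> Acc{1, 1, 1, 1};
      std::vector<int32_t> In{1, 2, 3, 4, 5, 6, 7, 8};
      expandPartialReduceAdd(Acc, In); // Acc becomes {4, 8, 12, 16}
      assert(Acc[0] == 4 && Acc[3] == 16);
      return 0;
    }
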
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 684e54444621b..21371bdd4bf6d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6131,6 +6131,19 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
break;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+ VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+ auto VecWidth = VecTy->getElementCount().getKnownMinValue();
+ auto AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+ Check((VecWidth % AccWidth) == 0, "Invalid vector widths for partial "
+ "reduction. The width of the input vector "
+                                    "must be a positive integer multiple of "
+ "the width of the accumulator vector.");
+ break;
+ }
case Intrinsic::experimental_noalias_scope_decl: {
NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
break;
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
index 6a5b3bd5ace2e..ccdcd1b740a27 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
@@ -4,72 +4,158 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
-define void @partial_reduce_add(<vscale x 16 x i8> %wide.load.pre, <vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %index) #0 {
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov v0.s[0], w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w10, v0.s[1]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: mov v0.s[0], w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: uaddv d1, p0, z1.s
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: index z2.s, #0, #1
-; CHECK-NEXT: mov z4.s, w8
-; CHECK-NEXT: mov w8, #2 // =0x2
-; CHECK-NEXT: ptrue p2.s, vl1
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: mov z3.s, w8
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: uaddv d1, p0, z1.s
+; CHECK-NEXT: ptrue p1.s, vl1
+; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: add w10, w10, w8
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov z0.s, p1/m, w10
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov w10, s0
; CHECK-NEXT: mov z6.s, w8
-; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z4.s
-; CHECK-NEXT: uaddv d3, p0, z0.s
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: uaddv d7, p0, z1.s
-; CHECK-NEXT: uaddv d4, p0, z5.s
+; CHECK-NEXT: index z5.s, #0, #1
+; CHECK-NEXT: ptrue p2.s, vl1
+; CHECK-NEXT: uaddv d1, p0, z1.s
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: uaddv d2, p0, z2.s
+; CHECK-NEXT: uaddv d3, p0, z3.s
+; CHECK-NEXT: cmpeq p1.s, p0/z, z5.s, z6.s
+; CHECK-NEXT: uaddv d4, p0, z4.s
+; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: fmov x8, d3
-; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: mov w10, #2 // =0x2
; CHECK-NEXT: mov z1.s, p2/m, w8
-; CHECK-NEXT: mov w8, #3 // =0x3
-; CHECK-NEXT: cmpeq p2.s, p0/z, z2.s, z6.s
-; CHECK-NEXT: mov z5.s, w8
-; CHECK-NEXT: fmov x8, d7
-; CHECK-NEXT: uaddv d3, p0, z3.s
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov z6.s, w10
+; CHECK-NEXT: mov w10, v0.s[2]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: cmpeq p2.s, p0/z, z5.s, z6.s
+; CHECK-NEXT: mov z2.s, w9
+; CHECK-NEXT: fmov x9, d3
; CHECK-NEXT: mov z1.s, p1/m, w8
-; CHECK-NEXT: fmov x8, d4
-; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z5.s
-; CHECK-NEXT: mov z1.s, p2/m, w8
-; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: add w9, w10, w9
+; CHECK-NEXT: cmpeq p0.s, p0/z, z5.s, z2.s
+; CHECK-NEXT: mov z1.s, p2/m, w9
+; CHECK-NEXT: fmov x9, d4
+; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: mov z1.s, p0/m, w8
-; CHECK-NEXT: addvl x8, x1, #1
-; CHECK-NEXT: .LBB0_1: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: orr z0.d, z1.d, z0.d
-; CHECK-NEXT: cbnz x8, .LBB0_1
-; CHECK-NEXT: // %bb.2: // %middle.block
+; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %2 = call i64 @llvm.vscale.i64()
- %3 = mul i64 %2, 16
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %4, %vector.body ]
- %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %1)
- %4 = or <vscale x 4 x i32> %partial.reduce, %vec.phi
- %index.next = add i64 %index, %3
- %5 = icmp eq i64 %index.next, 0
- br i1 %5, label %middle.block, label %vector.body
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
-middle.block: ; preds = %vector.body
- %6 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %4)
- ret void
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z2.s, z2.s, z3.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: index z3.s, #0, #1
+; CHECK-NEXT: mov z4.s, w8
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: ptrue p1.s, vl1
+; CHECK-NEXT: uaddv d2, p0, z2.s
+; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z4.s
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: add w10, w10, w8
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov z0.s, p1/m, w10
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 8 x i32> %partial.reduce
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare i64 @llvm.vscale.i64() #1
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>) #1
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32>) #1
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #2
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>) #2
attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }