[llvm] [IR][LangRef] Add partial reduction add intrinsic (PR #94499)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 08:11:34 PDT 2024
================
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Fixed-width case: the input vector has the same type as the accumulator
+; (<4 x i32> reduced into <4 x i32>).
+; The expected output sums all four lanes of v1 with ADDV, adds that scalar to
+; lane 0 of v0, and inserts the result back into lane 0 of v0 only.
+; (Presumably v0 holds %accumulator and v1 holds %0 per the usual AArch64
+; vector argument assignment -- confirm against the calling convention.)
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov v0.s[0], w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+; Fixed-width case: the input (<8 x i32>) has twice as many elements as the
+; accumulator (<4 x i32>).
+; The expected output sums the four lanes of v1 with ADDV, then adds that same
+; scalar to both lane 0 and lane 1 of v0 and writes the two results back.
+; NOTE(review): only v1.4s is read here. An <8 x i32> argument presumably
+; spans two vector registers (v1 and v2), and the same sum is added to two
+; accumulator lanes -- confirm this is the intended lowering.
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w10, v0.s[1]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: mov v0.s[0], w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+; Scalable case: the input vector has the same type as the accumulator
+; (<vscale x 4 x i32> reduced into <vscale x 4 x i32>).
+; The expected output sums every 32-bit lane of z1 with UADDV under an
+; all-lanes predicate, adds the low 32 bits of that sum to lane 0 of z0, and
+; merges the result back into z0 under a "vl1" predicate (first lane only).
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: uaddv d1, p0, z1.s
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+; Scalable case: the input (<vscale x 8 x i32>) has twice as many elements as
+; the accumulator (<vscale x 4 x i32>).
+; The expected output sums every 32-bit lane of z1 with UADDV, then adds that
+; same scalar to lane 0 and lane 1 of z0. Lane 0 is written back under a "vl1"
+; predicate (p1); lane 1 is written under a predicate built by comparing an
+; index vector (0, 1, 2, ...) against a splat of 1.
+; NOTE(review): only z1 is reduced. A <vscale x 8 x i32> argument presumably
+; spans two Z registers (z1 and z2), and the same sum is added to two
+; accumulator lanes -- confirm this is the intended lowering.
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: index z2.s, #0, #1
+; CHECK-NEXT: mov z3.s, w8
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: uaddv d1, p0, z1.s
+; CHECK-NEXT: ptrue p1.s, vl1
+; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: add w10, w10, w8
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov z0.s, p1/m, w10
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
----------------
huntergr-arm wrote:
This is reducing into the first 4 elements of the accumulator; it doesn't work correctly with vscale.
https://github.com/llvm/llvm-project/pull/94499
More information about the llvm-commits mailing list