[clang] 1adfa4c - [ARM,MVE] Add ACLE intrinsics for the vaddv/vaddlv family.
Simon Tatham via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 08:44:11 PDT 2020
Author: Simon Tatham
Date: 2020-03-20T15:42:33Z
New Revision: 1adfa4c99169733dedb67b4f7ab03d2fbb196162
URL: https://github.com/llvm/llvm-project/commit/1adfa4c99169733dedb67b4f7ab03d2fbb196162
DIFF: https://github.com/llvm/llvm-project/commit/1adfa4c99169733dedb67b4f7ab03d2fbb196162.diff
LOG: [ARM,MVE] Add ACLE intrinsics for the vaddv/vaddlv family.
Summary:
I've implemented them as target-specific IR intrinsics rather than
using `@llvm.experimental.vector.reduce.add`, on the grounds that the
'experimental' intrinsic doesn't currently offer much code-generation
benefit, and my replacements encapsulate the sign- or zero-extension
so that the illegal MVE vector type (`<4 x i64>`) is never exposed in
IR.
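For illustration only (these snippets are hand-written here, not part
of the patch): to sum a v4i32 into an i64 with the experimental
reducer you would first have to widen the input, roughly

  %ext = sext <4 x i32> %v to <4 x i64>
  %sum = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %ext)

whereas the new target-specific intrinsic keeps the extension implicit,

  %sum = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %v, i32 0)

with the trailing i32 flag selecting signed (0) or unsigned (1)
extension.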
The machine instructions come in two versions: with and without an
input accumulator. My new IR intrinsics, like the 'experimental' one,
don't take an accumulator parameter: instead, an input accumulator is
represented by simply adding it to the intrinsic's result with an
ordinary i32 or i64 add. So if you write the `vaddvaq` C-language
intrinsic with an input accumulator of zero, it can be optimised to
VADDV, and conversely, if you write something like `x += vaddvq(y)`
then that can be combined into VADDVA.
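Purely illustrative (the exact IR is in the generated tests below):
`x += vaddvq_s8(y)` arrives at isel as something like

  %sum = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %y, i32 0)
  %x1  = add i32 %sum, %x

and it is that ordinary trailing add that gets folded away when
selecting VADDVA.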
Most of this is achieved in isel lowering, by converting these IR
intrinsics into the existing `ARMISD::VADDV` family of custom SDNode
types. For the difficult case (64-bit accumulators), isel lowering
already implements the optimisation of folding an addition into a
VADDLV to make a VADDLVA; so once we've made a VADDLV, our job is
already done, except that I had to introduce a parallel set of ARMISD
nodes for the //predicated// forms of VADDLV.
For the simpler VADDV, we handle the predicated form by just leaving
the IR intrinsic alone and matching it in an ordinary DAG pattern.
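An illustrative sketch of that predicated case (again, the generated
tests below are authoritative): `vaddvq_p_s32(v, p)` reaches isel as
roughly

  %p32  = zext i16 %p to i32
  %pred = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p32)
  %sum  = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %v, i32 0, <4 x i1> %pred)

and that intrinsic call is matched directly by the new patterns in
ARMInstrMVE.td, without being converted to an ARMISD node first.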
Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard
Reviewed By: dmgreen
Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D76491
Added:
clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll
Modified:
clang/include/clang/Basic/arm_mve.td
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMISelLowering.h
llvm/lib/Target/ARM/ARMInstrMVE.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index d32f7fd92f2c..25daae2a0a25 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1445,6 +1445,33 @@ multiclass MVEBinaryVectorHoriz64R<dag subtract, dag exchange, string xsuffix> {
"vrmlldavha">;
}
+multiclass VADDV<bit acc, bit pred, string intbase, Type Scalar> {
+ defvar accArg = !if(acc, (args Scalar:$acc), (args));
+ defvar predArg = !if(pred, (args Predicate:$pred), (args));
+ defvar intrinsic = !if(pred,
+ IRInt<intbase # "_predicated", [Vector, Predicate]>,
+ IRInt<intbase, [Vector]>);
+ defvar intCG = !con((intrinsic $v, (unsignedflag Scalar)),
+ !if(pred, (? $pred), (?)));
+ defvar accCG = !if(acc, (add intCG, $acc), intCG);
+
+ def "": Intrinsic<Scalar, !con(accArg, (args Vector:$v), predArg), accCG>;
+}
+
+let params = T.Int in {
+defm vaddvq : VADDV<0, 0, "addv", Scalar32>;
+defm vaddvaq : VADDV<1, 0, "addv", Scalar32>;
+defm vaddvq_p : VADDV<0, 1, "addv", Scalar32>;
+defm vaddvaq_p : VADDV<1, 1, "addv", Scalar32>;
+}
+
+let params = [s32, u32] in {
+defm vaddlvq : VADDV<0, 0, "addlv", Scalar64>;
+defm vaddlvaq : VADDV<1, 0, "addlv", Scalar64>;
+defm vaddlvq_p : VADDV<0, 1, "addlv", Scalar64>;
+defm vaddlvaq_p : VADDV<1, 1, "addlv", Scalar64>;
+}
+
let params = T.Int in {
def vabavq : Intrinsic<u32, (args u32:$a, Vector:$b, Vector:$c),
(IRInt<"vabav", [Vector]> (unsignedflag Scalar), $a, $b, $c)>;
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
new file mode 100644
index 000000000000..6bacc2775881
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
@@ -0,0 +1,470 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+ // RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+ // RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vaddvq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s8(int8x16_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_s8(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s16(int16x8_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_s16(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s32(int32x4_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_s32(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vaddvq_u8(uint8x16_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_u8(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vaddvq_u16(uint16x8_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_u16(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vaddvq_u32(uint32x4_t a) {
+#ifdef POLYMORPHIC
+ return vaddvq(a);
+#else /* POLYMORPHIC */
+ return vaddvq_u32(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int32_t test_vaddvaq_s8(int32_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int32_t test_vaddvaq_s16(int32_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+int32_t test_vaddvaq_s32(int32_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+uint32_t test_vaddvaq_u8(uint32_t a, uint8x16_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+uint32_t test_vaddvaq_u16(uint32_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+uint32_t test_vaddvaq_u32(uint32_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+ return vaddvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddvaq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 0, <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vaddvq_p_s8(int8x16_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_s8(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 0, <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vaddvq_p_s16(int16x8_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_s16(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vaddvq_p_s32(int32x4_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_s32(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 1, <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vaddvq_p_u8(uint8x16_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_u8(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 1, <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vaddvq_p_u16(uint16x8_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_u16(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vaddvq_p_u32(uint32x4_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddvq_p_u32(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+int32_t test_vaddvaq_p_s8(int32_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+int32_t test_vaddvaq_p_s16(int32_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+int32_t test_vaddvaq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+uint32_t test_vaddvaq_p_u8(uint32_t a, uint8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+uint32_t test_vaddvaq_p_u16(uint32_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvaq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+uint32_t test_vaddvaq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddvaq_p_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
+int64_t test_vaddlvq_s32(int32x4_t a) {
+#ifdef POLYMORPHIC
+ return vaddlvq(a);
+#else /* POLYMORPHIC */
+ return vaddlvq_s32(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
+uint64_t test_vaddlvq_u32(uint32x4_t a) {
+#ifdef POLYMORPHIC
+ return vaddlvq(a);
+#else /* POLYMORPHIC */
+ return vaddlvq_u32(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
+int64_t test_vaddlvaq_s32(int64_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vaddlvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddlvaq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvaq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
+uint64_t test_vaddlvaq_u32(uint64_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+ return vaddlvaq(a, b);
+#else /* POLYMORPHIC */
+ return vaddlvaq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i64 [[TMP2]]
+//
+int64_t test_vaddlvq_p_s32(int32x4_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddlvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddlvq_p_s32(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i64 [[TMP2]]
+//
+uint64_t test_vaddlvq_p_u32(uint32x4_t a, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddlvq_p(a, p);
+#else /* POLYMORPHIC */
+ return vaddlvq_p_u32(a, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i64 [[TMP3]]
+//
+int64_t test_vaddlvaq_p_s32(int64_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddlvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddlvaq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddlvaq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]]
+// CHECK-NEXT: ret i64 [[TMP3]]
+//
+uint64_t test_vaddlvaq_p_u32(uint64_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vaddlvaq_p(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddlvaq_p_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index f5e29df9186b..b41831ed1f61 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -911,6 +911,11 @@ multiclass MVE_minmaxv {
defm int_arm_mve_min: MVE_minmaxv;
defm int_arm_mve_max: MVE_minmaxv;
+defm int_arm_mve_addv: MVEPredicated<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>;
+defm int_arm_mve_addlv: MVEPredicated<[llvm_i64_ty],
+ [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>;
+
// Intrinsic with a predicated and a non-predicated case. The predicated case
// has two additional parameters: inactive (the value for inactive lanes, can
// be undef) and predicate.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6d7ac5325e0b..f753ce60f149 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1669,6 +1669,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu";
+ case ARMISD::VADDLVps: return "ARMISD::VADDLVps";
+ case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu";
+ case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps";
+ case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu";
case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
@@ -11816,18 +11820,15 @@ static SDValue PerformADDVecReduce(SDNode *N,
return SDValue();
SDLoc dl(N);
- SDValue Lo = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
- DCI.DAG.getConstant(0, dl, MVT::i32));
- SDValue Hi = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
- DCI.DAG.getConstant(1, dl, MVT::i32));
- SDValue Red =
- VecRed->getNumOperands() == 1
- ? DCI.DAG.getNode(OpcodeA, dl,
- DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
- VecRed->getOperand(0))
- : DCI.DAG.getNode(OpcodeA, dl,
- DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
- VecRed->getOperand(0), VecRed->getOperand(1));
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(0, dl, MVT::i32)));
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(1, dl, MVT::i32)));
+ for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
+ Ops.push_back(VecRed->getOperand(i));
+ SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
+ DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
SDValue(Red.getNode(), 1));
};
@@ -11840,6 +11841,14 @@ static SDValue PerformADDVecReduce(SDNode *N,
return M;
if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
+ return M;
if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
return M;
if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
@@ -14373,6 +14382,34 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
return SDValue();
break;
}
+
+ case Intrinsic::arm_mve_addv: {
+ // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
+ // which allows PerformADDVecReduce to turn it into VADDLV when possible.
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
+ }
+
+ case Intrinsic::arm_mve_addlv:
+ case Intrinsic::arm_mve_addlv_predicated: {
+ // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
+ // which recombines the two outputs into an i64
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
+ (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
+ (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
+
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
+ if (i != 2) // skip the unsigned flag
+ Ops.push_back(N->getOperand(i));
+
+ SDLoc dl(N);
+ SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
+ val.getValue(1));
+ }
}
return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index d95e4278e958..b7b1d3a02358 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -207,12 +207,16 @@ class VectorType;
VMULLu, // ...unsigned
// MVE reductions
- VADDVs,
- VADDVu,
- VADDLVs,
- VADDLVu,
- VADDLVAs,
- VADDLVAu,
+ VADDVs, // sign- or zero-extend the elements of a vector to i32,
+ VADDVu, // add them all together, and return an i32 of their sum
+ VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
+ VADDLVu, // the low and high 32-bit halves of the sum
+ VADDLVAs, // same as VADDLV[su] but also add an input accumulator
+ VADDLVAu, // provided as low and high halves
+ VADDLVps, // same as VADDLVs but with a v4i1 predicate mask
+ VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask
+ VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask
+ VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask
VMLAVs,
VMLAVu,
VMLALVs,
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 26cf3004ff97..b94d071ca03d 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -607,60 +607,59 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = 0b0;
}
-multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size,
- list<dag> pattern=[]> {
- def acc : MVE_VADDV<"vaddva", suffix,
+def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
+def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+
+multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDV<"vaddva", VTI.Suffix,
(ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
- 0b1, U, size, pattern>;
- def no_acc : MVE_VADDV<"vaddv", suffix,
+ 0b1, VTI.Unsigned, VTI.Size>;
+ def no_acc : MVE_VADDV<"vaddv", VTI.Suffix,
(ins MQPR:$Qm), "",
- 0b0, U, size, pattern>;
-}
+ 0b0, VTI.Unsigned, VTI.Size>;
-defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
-defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
-defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
-defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
-defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
-defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
-def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
-def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+ let Predicates = [HasMVEInt] in {
+ if VTI.Unsigned then {
+ def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ } else {
+ def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ }
-let Predicates = [HasMVEInt] in {
- def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))),
- (i32 (MVE_VADDVu32no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))),
- (i32 (MVE_VADDVu16no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))),
- (i32 (MVE_VADDVu8no_acc $src))>;
-
- def : Pat<(i32 (ARMVADDVs (v8i16 MQPR:$src))),
- (i32 (MVE_VADDVs16no_acc $src))>;
- def : Pat<(i32 (ARMVADDVu (v8i16 MQPR:$src))),
- (i32 (MVE_VADDVu16no_acc $src))>;
- def : Pat<(i32 (ARMVADDVs (v16i8 MQPR:$src))),
- (i32 (MVE_VADDVs8no_acc $src))>;
- def : Pat<(i32 (ARMVADDVu (v16i8 MQPR:$src))),
- (i32 (MVE_VADDVu8no_acc $src))>;
-
- def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVu32acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVu16acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVu8acc $src2, $src1))>;
-
- def : Pat<(i32 (add (i32 (ARMVADDVs (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVs16acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (ARMVADDVu (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVu16acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (ARMVADDVs (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVs8acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (ARMVADDVu (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
- (i32 (MVE_VADDVu8acc $src2, $src1))>;
+ def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred)),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ }
}
+defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>;
+defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>;
+defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>;
+defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>;
+defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>;
+defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>;
+
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -681,20 +680,6 @@ class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = 0b0;
}
-multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
- def acc : MVE_VADDLV<"vaddlva", suffix,
- (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
- "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
- 0b1, U, pattern>;
- def no_acc : MVE_VADDLV<"vaddlv", suffix,
- (ins MQPR:$Qm), "",
- 0b0, U, pattern>;
-}
-
-
-defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
-defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
-
def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV
SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>
]>;
@@ -702,23 +687,49 @@ def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA
SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
SDTCisVec<4>
]>;
-def ARMVADDLVs : SDNode<"ARMISD::VADDLVs", SDTVecReduceL>;
-def ARMVADDLVu : SDNode<"ARMISD::VADDLVu", SDTVecReduceL>;
-def ARMVADDLVAs : SDNode<"ARMISD::VADDLVAs", SDTVecReduceLA>;
-def ARMVADDLVAu : SDNode<"ARMISD::VADDLVAu", SDTVecReduceLA>;
+def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2>
+]>;
+def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>
+]>;
-let Predicates = [HasMVEInt] in {
- def : Pat<(ARMVADDLVs (v4i32 MQPR:$val1)),
- (MVE_VADDLVs32no_acc (v4i32 MQPR:$val1))>;
- def : Pat<(ARMVADDLVu (v4i32 MQPR:$val1)),
- (MVE_VADDLVu32no_acc (v4i32 MQPR:$val1))>;
+multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDLV<"vaddlva", VTI.Suffix,
+ (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
+ "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ 0b1, VTI.Unsigned>;
+ def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix,
+ (ins MQPR:$Qm), "",
+ 0b0, VTI.Unsigned>;
+
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
- def : Pat<(ARMVADDLVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)),
- (MVE_VADDLVs32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>;
- def : Pat<(ARMVADDLVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)),
- (MVE_VADDLVu32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>;
+ defvar letter = VTI.SuffixLetter;
+ defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>;
+ defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>;
+ defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>;
+ defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>;
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)),
+ (InstN (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)),
+ (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ (VTI.Pred VCCR:$pred)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ }
}
+defm MVE_VADDLVs32 : MVE_VADDLV_A<MVE_v4s32>;
+defm MVE_VADDLVu32 : MVE_VADDLV_A<MVE_v4u32>;
+
class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
bit bit_17, bit bit_7, list<dag> pattern=[]>
: MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm),
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll
new file mode 100644
index 000000000000..12db8d95a327
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll
@@ -0,0 +1,416 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vaddvq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 0)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vaddvq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.s16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 0)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vaddvq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.s32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 0)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vaddvq_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 1)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vaddvq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 1)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vaddvq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 1)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_s8(i32 %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vaddvaq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 0)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_s16(i32 %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vaddvaq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.s16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 0)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddvaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.s32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 0)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_u8(i32 %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vaddvaq_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 1)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_u16(i32 %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vaddvaq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 1)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_u32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddvaq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 1)
+ %1 = add i32 %0, %a
+ ret i32 %1
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_s8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 0, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_s16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.s16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 0, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_s32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.s32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_u8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 1, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_u16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.u16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 1, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvq_p_u32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvt.u32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s8(i32 %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 0, <16 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s16(i32 %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.s16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 0, <8 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.s32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u8(i32 %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 1, <16 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u16(i32 %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.u16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 1, <8 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddvaq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddvat.u32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1)
+ %3 = add i32 %2, %a
+ ret i32 %3
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vaddlvq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddlv.s32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 0)
+ ret i64 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vaddlvq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddlv.u32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 1)
+ ret i64 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvaq_s32(i64 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddlvaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddlva.s32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 0)
+ %1 = add i64 %0, %a
+ ret i64 %1
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvaq_u32(i64 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vaddlvaq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddlva.u32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 1)
+ %1 = add i64 %0, %a
+ ret i64 %1
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvq_p_s32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddlvq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddlvt.s32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1)
+ ret i64 %2
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvq_p_u32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddlvq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddlvt.u32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1)
+ ret i64 %2
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_s32(i64 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddlvaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddlvat.s32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1)
+ %3 = add i64 %2, %a
+ ret i64 %3
+}
+
+define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_u32(i64 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddlvaq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddlvat.u32 r0, r1, q0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1)
+ %3 = add i64 %2, %a
+ ret i64 %3
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.addv.v16i8(<16 x i8>, i32)
+declare i32 @llvm.arm.mve.addv.v8i16(<8 x i16>, i32)
+declare i32 @llvm.arm.mve.addv.v4i32(<4 x i32>, i32)
+declare i64 @llvm.arm.mve.addlv.v4i32(<4 x i32>, i32)
+
+declare i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8>, i32, <16 x i1>)
+declare i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>)
+declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
+declare i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)