[clang] 99581fd - [ARM][MVE] Add vector reduction intrinsics with two vector operands
Mikhail Maltsev via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 13 05:18:02 PST 2019
Author: Mikhail Maltsev
Date: 2019-12-13T13:17:29Z
New Revision: 99581fd4c8e12f5eca38e7cfc5992508a9bfe383
URL: https://github.com/llvm/llvm-project/commit/99581fd4c8e12f5eca38e7cfc5992508a9bfe383
DIFF: https://github.com/llvm/llvm-project/commit/99581fd4c8e12f5eca38e7cfc5992508a9bfe383.diff
LOG: [ARM][MVE] Add vector reduction intrinsics with two vector operands
Summary:
This patch adds intrinsics for the following MVE instructions:
* VABAV
* VMLADAV, VMLSDAV
* VMLALDAV, VMLSLDAV
* VRMLALDAVH, VRMLSLDAVH
Each of the above four instruction groups gets a corresponding new LLVM IR
intrinsic, since these instructions cannot easily be represented using
general-purpose IR operations.
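As a minimal usage sketch (the C function name below is hypothetical; the
intrinsic, its signature, and the IR it lowers to are taken from the tests
added in this patch), the non-predicated accumulating form of VMLADAV is
exposed as vmladavaq_s8 and lowers to a single call to the new vmldava IR
intrinsic, with the unsigned/subtract/exchange flags passed as constant i32
operands:

    #include <arm_mve.h>

    // Multiplies corresponding lanes of b and c, sums the products, and
    // accumulates the result into the scalar acc. Lowers to roughly:
    //   call i32 @llvm.arm.mve.vmldava.v16i8(i32 0 /*unsigned*/,
    //       i32 0 /*subtract*/, i32 0 /*exchange*/,
    //       i32 %acc, <16 x i8> %b, <16 x i8> %c)
    int32_t accumulate_dot_product(int32_t acc, int8x16_t b, int8x16_t c) {
      return vmladavaq_s8(acc, b, c);
    }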
Reviewers: simon_tatham, ostannard, dmgreen, MarkMurrayARM
Reviewed By: MarkMurrayARM
Subscribers: merge_guards_bot, kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D71062
Added:
clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll
Modified:
clang/include/clang/Basic/arm_mve.td
clang/include/clang/Basic/arm_mve_defs.td
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMInstrMVE.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 9d9c067ade1c..6a27bdb807a9 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -775,6 +775,118 @@ defm vcmulq : VectorComplexMulAngle;
defm vcmlaq : VectorComplexMLAAngle;
}
+multiclass MVEBinaryVectorHoriz32<dag subtract, dag exchange, string xsuffix> {
+ def xsuffix#"q"
+ : Intrinsic<Scalar32, (args Vector:$a, Vector:$b),
+ (IRInt<"vmldava", [Vector]>
+ (unsignedflag Scalar), subtract, exchange,
+ (zeroinit Scalar32), $a, $b)>;
+ def xsuffix#"q_p"
+ : Intrinsic<Scalar32, (args Vector:$a, Vector:$b, Predicate:$pred),
+ (IRInt<"vmldava_predicated", [Vector, Predicate]>
+ (unsignedflag Scalar), subtract, exchange,
+ (zeroinit Scalar32), $a, $b, $pred)>;
+
+ def "a"#xsuffix#"q"
+ : Intrinsic<Scalar32, (args Scalar32:$a, Vector:$b, Vector:$c),
+ (IRInt<"vmldava", [Vector]>
+ (unsignedflag Scalar), subtract, exchange,
+ $a, $b, $c)>;
+ def "a"#xsuffix#"q_p"
+ : Intrinsic<Scalar32, (args Scalar32:$a, Vector:$b, Vector:$c,
+ Predicate:$pred),
+ (IRInt<"vmldava_predicated", [Vector, Predicate]>
+ (unsignedflag Scalar), subtract, exchange,
+ $a, $b, $c, $pred)>;
+}
+
+class IntrSplit64<Type resty, dag args, dag codegen>
+ : Intrinsic<resty, args,
+ (seq (u32 (lshr $a, (u64 32))):$hi,
+ (u32 $a):$lo,
+ codegen:$pair,
+ (or (shl (u64 (xval $pair, 1)), (u64 32)),
+ (u64 (xval $pair, 0))))>;
+
+class IntrSplit64ZeroInit<Type resty, dag args, dag codegen>
+ : Intrinsic<resty, args,
+ (seq (zeroinit u32):$hi,
+ (zeroinit u32):$lo,
+ codegen:$pair,
+ (or (shl (u64 (xval $pair, 1)), (u64 32)),
+ (u64 (xval $pair, 0))))>;
+
+multiclass MVEBinaryVectorHoriz64Base<dag subtract, dag exchange,
+ string xsuffix, string irname> {
+ def xsuffix#"q"
+ : IntrSplit64ZeroInit<Scalar64, (args Vector:$a, Vector:$b),
+ (IRInt<irname, [Vector]>
+ (unsignedflag Scalar), subtract, exchange,
+ $lo, $hi, $a, $b)>;
+ def xsuffix#"q_p"
+ : IntrSplit64ZeroInit<Scalar64, (args Vector:$a, Vector:$b,
+ Predicate:$pred),
+ (IRInt<irname#"_predicated", [Vector, Predicate]>
+ (unsignedflag Scalar), subtract, exchange,
+ $lo, $hi, $a, $b, $pred)>;
+
+ def "a"#xsuffix#"q"
+ : IntrSplit64<Scalar64, (args Scalar64:$a, Vector:$b, Vector:$c),
+ (IRInt<irname, [Vector]>
+ (unsignedflag Scalar), subtract, exchange,
+ $lo, $hi, $b, $c)>;
+ def "a"#xsuffix#"q_p"
+ : IntrSplit64<Scalar64, (args Scalar64:$a, Vector:$b, Vector:$c,
+ Predicate:$pred),
+ (IRInt<irname#"_predicated", [Vector, Predicate]>
+ (unsignedflag Scalar), subtract, exchange,
+ $lo, $hi, $b, $c, $pred)>;
+}
+
+multiclass MVEBinaryVectorHoriz64<dag subtract, dag exchange, string xsuffix> {
+ defm "" : MVEBinaryVectorHoriz64Base<subtract, exchange, xsuffix, "vmlldava">;
+}
+
+multiclass MVEBinaryVectorHoriz64R<dag subtract, dag exchange, string xsuffix> {
+ defm "" : MVEBinaryVectorHoriz64Base<subtract, exchange, xsuffix,
+ "vrmlldavha">;
+}
+
+let params = T.Int in {
+def vabavq : Intrinsic<u32, (args u32:$a, Vector:$b, Vector:$c),
+ (IRInt<"vabav", [Vector]> (unsignedflag Scalar), $a, $b, $c)>;
+def vabavq_p : Intrinsic<u32, (args u32:$a, Vector:$b, Vector:$c,
+ Predicate:$pred),
+ (IRInt<"vabav_predicated", [Vector, Predicate]>
+ (unsignedflag Scalar), $a, $b, $c, $pred)>;
+
+defm vmladav : MVEBinaryVectorHoriz32<V.False, V.False, "">;
+}
+
+let params = T.Signed in {
+defm vmladav : MVEBinaryVectorHoriz32<V.False, V.True, "x">;
+defm vmlsdav : MVEBinaryVectorHoriz32<V.True, V.False, "">;
+defm vmlsdav : MVEBinaryVectorHoriz32<V.True, V.True, "x">;
+}
+
+let params = [u16, s16, u32, s32] in
+defm vmlaldav : MVEBinaryVectorHoriz64<V.False, V.False, "">;
+
+let params = [s16, s32] in {
+defm vmlaldav : MVEBinaryVectorHoriz64<V.False, V.True, "x">;
+defm vmlsldav : MVEBinaryVectorHoriz64<V.True, V.False, "">;
+defm vmlsldav : MVEBinaryVectorHoriz64<V.True, V.True, "x">;
+}
+
+let params = T.Int32 in
+defm vrmlaldavh : MVEBinaryVectorHoriz64R<V.False, V.False, "">;
+
+let params = [s32] in {
+defm vrmlaldavh : MVEBinaryVectorHoriz64R<V.False, V.True, "x">;
+defm vrmlsldavh : MVEBinaryVectorHoriz64R<V.True, V.False, "">;
+defm vrmlsldavh : MVEBinaryVectorHoriz64R<V.True, V.True, "x">;
+}
+
foreach desttype = T.All in {
// We want a vreinterpretq between every pair of supported vector types
// _except_ that there shouldn't be one from a type to itself.
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 3e22e44607ca..03472fb47b6c 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -284,6 +284,11 @@ def UVector: VecOf<UScalar>;
// Scalar.
def DblVector: VecOf<DoubleSize<Scalar>>;
+// Expands to the 32-bit integer of the same signedness as Scalar.
+def Scalar32: CopyKind<u32, Scalar>;
+// Expands to the 64-bit integer of the same signedness as Scalar.
+def Scalar64: CopyKind<u64, Scalar>;
+
// -----------------------------------------------------------------------------
// Internal definitions for specifying immediate arguments for an intrinsic.
@@ -478,3 +483,13 @@ def T {
list<Type> All64 = Int64;
list<Type> All = Usual # All64;
}
+
+// -----------------------------------------------------------------------------
+// Container record for DAG constant values. These constants are used because
+// bit/int class/multiclass parameters cannot be used to produce a dag node:
+// for example (u32 x) where x is 0 is transformed into (u32 { 0 }) by the
+// Tablegen parser.
+def V {
+ dag False = (u32 0);
+ dag True = (u32 1);
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
new file mode 100644
index 000000000000..e1362760bb8f
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vabavq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s8(uint32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s16(uint32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s32(uint32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_u8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+ return vabavq(a, b, c);
+#else
+ return vabavq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s8(uint32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s16(uint32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s32(uint32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_u8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vabavq_p(a, b, c, p);
+#else
+ return vabavq_p_u32(a, b, c, p);
+#endif
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
new file mode 100644
index 000000000000..60339ff8db56
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
@@ -0,0 +1,845 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmladavaq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_u8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaq(a, b, c);
+#else
+ return vmladavaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaxq(a, b, c);
+#else
+ return vmladavaxq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaxq(a, b, c);
+#else
+ return vmladavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmladavaxq(a, b, c);
+#else
+ return vmladavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq(a, b, c);
+#else
+ return vmlsdavaq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq(a, b, c);
+#else
+ return vmlsdavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq(a, b, c);
+#else
+ return vmlsdavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq(a, b, c);
+#else
+ return vmlsdavaxq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq(a, b, c);
+#else
+ return vmlsdavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq(a, b, c);
+#else
+ return vmlsdavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_u8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaq_p(a, b, c, p);
+#else
+ return vmladavaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaxq_p(a, b, c, p);
+#else
+ return vmladavaxq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaxq_p(a, b, c, p);
+#else
+ return vmladavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavaxq_p(a, b, c, p);
+#else
+ return vmladavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq_p(a, b, c, p);
+#else
+ return vmlsdavaq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq_p(a, b, c, p);
+#else
+ return vmlsdavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaq_p(a, b, c, p);
+#else
+ return vmlsdavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq_p(a, b, c, p);
+#else
+ return vmlsdavaxq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq_p(a, b, c, p);
+#else
+ return vmlsdavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavaxq_p(a, b, c, p);
+#else
+ return vmlsdavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u8(uint8x16_t a, uint8x16_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_u8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u16(uint16x8_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_u16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmladavq(a, b);
+#else
+ return vmladavq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+ return vmladavxq(a, b);
+#else
+ return vmladavxq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmladavxq(a, b);
+#else
+ return vmladavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmladavxq(a, b);
+#else
+ return vmladavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavq(a, b);
+#else
+ return vmlsdavq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavq(a, b);
+#else
+ return vmlsdavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavq(a, b);
+#else
+ return vmlsdavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq(a, b);
+#else
+ return vmlsdavxq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq(a, b);
+#else
+ return vmlsdavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq(a, b);
+#else
+ return vmlsdavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_u8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_u16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavq_p(a, b, p);
+#else
+ return vmladavq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavxq_p(a, b, p);
+#else
+ return vmladavxq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavxq_p(a, b, p);
+#else
+ return vmladavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmladavxq_p(a, b, p);
+#else
+ return vmladavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavq_p(a, b, p);
+#else
+ return vmlsdavq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavq_p(a, b, p);
+#else
+ return vmlsdavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavq_p(a, b, p);
+#else
+ return vmlsdavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq_p(a, b, p);
+#else
+ return vmlsdavxq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq_p(a, b, p);
+#else
+ return vmlsdavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsdavxq_p(a, b, p);
+#else
+ return vmlsdavxq_p_s32(a, b, p);
+#endif
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
new file mode 100644
index 000000000000..8bc1e2531e98
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
@@ -0,0 +1,1295 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmlaldavaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq(a, b, c);
+#else
+ return vmlaldavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq(a, b, c);
+#else
+ return vmlaldavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_vmlaldavaq_u16(uint64_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq(a, b, c);
+#else
+ return vmlaldavaq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_vmlaldavaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq(a, b, c);
+#else
+ return vmlaldavaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaxq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaxq(a, b, c);
+#else
+ return vmlaldavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlaldavaxq(a, b, c);
+#else
+ return vmlaldavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlsldavaq(a, b, c);
+#else
+ return vmlsldavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlsldavaq(a, b, c);
+#else
+ return vmlsldavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldaxvaq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlsldaxvaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+ return vmlsldavaxq(a, b, c);
+#else
+ return vmlsldavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vmlsldavaxq(a, b, c);
+#else
+ return vmlsldavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vrmlaldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaq(a, b, c);
+#else
+ return vrmlaldavhaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_vrmlaldavhaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaq(a, b, c);
+#else
+ return vrmlaldavhaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vrmlaldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaxq(a, b, c);
+#else
+ return vrmlaldavhaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vrmlsldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhaq(a, b, c);
+#else
+ return vrmlsldavhaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_vrmlsldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhaxq(a, b, c);
+#else
+ return vrmlsldavhaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq_p(a, b, c, p);
+#else
+ return vmlaldavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq_p(a, b, c, p);
+#else
+ return vmlaldavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+uint64_t test_vmlaldavaq_p_u16(uint64_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq_p(a, b, c, p);
+#else
+ return vmlaldavaq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+uint64_t test_vmlaldavaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaq_p(a, b, c, p);
+#else
+ return vmlaldavaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaxq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaxq_p(a, b, c, p);
+#else
+ return vmlaldavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavaxq_p(a, b, c, p);
+#else
+ return vmlaldavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavaq_p(a, b, c, p);
+#else
+ return vmlsldavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavaq_p(a, b, c, p);
+#else
+ return vmlsldavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldaxvaq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlsldaxvaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavaxq_p(a, b, c, p);
+#else
+ return vmlsldavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavaxq_p(a, b, c, p);
+#else
+ return vmlsldavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vrmlaldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaq_p(a, b, c, p);
+#else
+ return vrmlaldavhaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+uint64_t test_vrmlaldavhaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaq_p(a, b, c, p);
+#else
+ return vrmlaldavhaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vrmlaldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhaxq_p(a, b, c, p);
+#else
+ return vrmlaldavhaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vrmlsldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhaq_p(a, b, c, p);
+#else
+ return vrmlsldavhaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT: ret i64 [[TMP11]]
+//
+int64_t test_vrmlsldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhaxq_p(a, b, c, p);
+#else
+ return vrmlsldavhaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavq(a, b);
+#else
+ return vmlaldavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavq(a, b);
+#else
+ return vmlaldavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_vmlaldavq_u16(uint16x8_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavq(a, b);
+#else
+ return vmlaldavq_u16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_vmlaldavq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavq(a, b);
+#else
+ return vmlaldavq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavxq(a, b);
+#else
+ return vmlaldavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlaldavxq(a, b);
+#else
+ return vmlaldavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlsldavq(a, b);
+#else
+ return vmlsldavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlsldavq(a, b);
+#else
+ return vmlsldavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxvq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavxvq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+ return vmlsldavxq(a, b);
+#else
+ return vmlsldavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vmlsldavxq(a, b);
+#else
+ return vmlsldavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vrmlaldavhq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhq(a, b);
+#else
+ return vrmlaldavhq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_vrmlaldavhq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhq(a, b);
+#else
+ return vrmlaldavhq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vrmlaldavhxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhxq(a, b);
+#else
+ return vrmlaldavhxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vrmlsldavhq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhq(a, b);
+#else
+ return vrmlsldavhq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhxq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+int64_t test_vrmlsldavhxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhxq(a, b);
+#else
+ return vrmlsldavhxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavq_p(a, b, p);
+#else
+ return vmlaldavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavq_p(a, b, p);
+#else
+ return vmlaldavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+uint64_t test_vmlaldavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavq_p(a, b, p);
+#else
+ return vmlaldavq_p_u16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+uint64_t test_vmlaldavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavq_p(a, b, p);
+#else
+ return vmlaldavq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavxq_p(a, b, p);
+#else
+ return vmlaldavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlaldavxq_p(a, b, p);
+#else
+ return vmlaldavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavq_p(a, b, p);
+#else
+ return vmlsldavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavq_p(a, b, p);
+#else
+ return vmlsldavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldaxvq_p_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlsldaxvq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavxq_p(a, b, p);
+#else
+ return vmlsldavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vmlsldavxq_p(a, b, p);
+#else
+ return vmlsldavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vrmlaldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhq_p(a, b, p);
+#else
+ return vrmlaldavhq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_p_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+uint64_t test_vrmlaldavhq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhq_p(a, b, p);
+#else
+ return vrmlaldavhq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vrmlaldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlaldavhxq_p(a, b, p);
+#else
+ return vrmlaldavhxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vrmlsldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhq_p(a, b, p);
+#else
+ return vrmlsldavhq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhxq_p_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT: ret i64 [[TMP8]]
+//
+int64_t test_vrmlsldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+ return vrmlsldavhxq_p(a, b, p);
+#else
+ return vrmlsldavhxq_p_s32(a, b, p);
+#endif
+}
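
A minimal C sketch of the accumulator handling exercised by the 64-bit tests above (illustrative only, not part of the test file): every i64-accumulating test splits the accumulator into two i32 halves before calling the two-result IR intrinsic, and reassembles the returned {lo, hi} pair into an i64 afterwards, exactly as the lshr/trunc and zext/shl/or CHECK lines show.

#include <stdint.h>

/* Split a 64-bit accumulator into the low/high 32-bit halves passed to the
   two-result intrinsic (mirrors the lshr + trunc CHECK lines above). */
static void split_acc64(int64_t a, uint32_t *lo, uint32_t *hi) {
  *hi = (uint32_t)((uint64_t)a >> 32);
  *lo = (uint32_t)a;
}

/* Reassemble the {lo, hi} result pair into an i64 (mirrors the zext, shl
   and or CHECK lines above). */
static int64_t join_acc64(uint32_t lo, uint32_t hi) {
  return (int64_t)(((uint64_t)hi << 32) | lo);
}
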
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index a10620612e23..f76ba5635127 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1046,4 +1046,51 @@ def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMat
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+// MVE vector absolute difference and accumulate across vector
+// The first operand is an 'unsigned' flag. The remaining operands are:
+// * accumulator
+// * first vector operand
+// * second vector operand
+// * mask (only in predicated versions)
+defm int_arm_mve_vabav: MVEPredicated<
+ [llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty,
+ [IntrNoMem]>;
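For reference, a scalar sketch of the operation this intrinsic models in its unpredicated s16 form (an illustrative editor's sketch, not part of the patch; the lane count of 8 follows the int16x8_t tests): the absolute difference of each pair of corresponding lanes is added to the 32-bit scalar accumulator.

#include <stdint.h>

/* Scalar model of the unpredicated vabav operation on 8 x i16 lanes. */
static uint32_t vabav_s16_ref(uint32_t acc, const int16_t b[8],
                              const int16_t c[8]) {
  for (int i = 0; i < 8; i++) {
    int32_t d = (int32_t)b[i] - (int32_t)c[i];
    acc += (uint32_t)(d < 0 ? -d : d);   /* |b[i] - c[i]| */
  }
  return acc;
}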
+
+// The following 3 intrinsics are MVE vector reductions with two vector
+// operands.
+// The first 3 operands are boolean flags (must be compile-time constants):
+// * unsigned - the instruction operates on vectors of unsigned values and
+// unsigned scalars
+// * subtract - the instruction performs subtraction after multiplication of
+// lane pairs (e.g., vmlsdav vs vmladav)
+// * exchange - the instruction exchanges successive even and odd lanes of
+//              the first operand before multiplication of lane pairs
+// (e.g., vmladavx vs vmladav)
+// The remaining operands are:
+// * accumulator
+// * first vector operand
+// * second vector operand
+// * mask (only in predicated versions)
+
+// Version with 32-bit result, vml{a,s}dav[a][x]
+defm int_arm_mve_vmldava: MVEPredicated<
+ [llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+ llvm_anyvector_ty, [IntrNoMem]>;
+
+// Version with 64-bit result, vml{a,s}ldav[a][x]
+defm int_arm_mve_vmlldava: MVEPredicated<
+ [llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+ llvm_anyvector_ty, [IntrNoMem]>;
+
+// Version with 72-bit rounded result, vrml{a,s}ldavh[a][x]
+defm int_arm_mve_vrmlldavha: MVEPredicated<
+ [llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+ llvm_anyvector_ty, [IntrNoMem]>;
} // end TargetPrefix
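
As a point of reference for the flag encoding above, the simplest 64-bit case -- vmlaldavq_s16, i.e. int_arm_mve_vmlldava with the unsigned, subtract and exchange flags all 0 and a zero accumulator, as in the Clang test test_vmlaldavq_s16 earlier -- reduces to the following scalar model (an illustrative sketch, not part of the patch):

#include <stdint.h>

/* Widen each pair of corresponding lanes, multiply, and sum into a 64-bit
   accumulator; the subtract/exchange flags select the vmlsldav / ...x
   variants instead of this plain form. */
static int64_t vmlaldav_s16_ref(const int16_t a[8], const int16_t b[8]) {
  int64_t acc = 0;
  for (int i = 0; i < 8; i++)
    acc += (int64_t)a[i] * (int64_t)b[i];
  return acc;
}
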
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6dd56b35d0ab..acbbf20f3ef9 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -237,6 +237,27 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
+ /// Select long MVE vector reductions with two vector operands
+ /// Stride is the number of vector element widths the instruction can operate
+ /// on:
+ /// 2 for long non-rounding variants, vml{a,s}ldav[a][x]: [i16, i32]
+ /// 1 for long rounding variants: vrml{a,s}ldavh[a][x]: [i32]
+ /// Stride is used when addressing the OpcodesS array which contains multiple
+ /// opcodes for each element width.
+ /// TySize is the index into the list of element types listed above
+ void SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS, const uint16_t *OpcodesU,
+ size_t Stride, size_t TySize);
+
+ /// Select a 64-bit MVE vector reduction with two vector operands
+ /// arm_mve_vmlldava_[predicated]
+ void SelectMVE_VMLLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU);
+ /// Select a 72-bit MVE vector rounding reduction with two vector operands
+ /// int_arm_mve_vrmlldavha[_predicated]
+ void SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU);
+
/// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs
/// should be 2 or 4. The opcode array specifies the instructions
/// used for 8, 16 and 32-bit lane sizes respectively, and each
@@ -2531,6 +2552,96 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
}
+static bool SDValueToConstBool(SDValue SDVal) {
+ assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
+ ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
+ uint64_t Value = SDValConstant->getZExtValue();
+ assert((Value == 0 || Value == 1) && "expected value 0 or 1");
+ return Value;
+}
+
+void ARMDAGToDAGISel::SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU,
+ size_t Stride, size_t TySize) {
+ assert(TySize < Stride && "Invalid TySize");
+ bool IsUnsigned = SDValueToConstBool(N->getOperand(1));
+ bool IsSub = SDValueToConstBool(N->getOperand(2));
+ bool IsExchange = SDValueToConstBool(N->getOperand(3));
+ if (IsUnsigned) {
+ assert(!IsSub &&
+ "Unsigned versions of vmlsldav[a]/vrmlsldavh[a] do not exist");
+ assert(!IsExchange &&
+ "Unsigned versions of vmlaldav[a]x/vrmlaldavh[a]x do not exist");
+ }
+
+ auto OpIsZero = [N](size_t OpNo) {
+ if (ConstantSDNode *OpConst = dyn_cast<ConstantSDNode>(N->getOperand(OpNo)))
+ if (OpConst->getZExtValue() == 0)
+ return true;
+ return false;
+ };
+
+ // If the input accumulator value is not zero, select an instruction with
+ // accumulator, otherwise select an instruction without accumulator
+ bool IsAccum = !(OpIsZero(4) && OpIsZero(5));
+
+ const uint16_t *Opcodes = IsUnsigned ? OpcodesU : OpcodesS;
+ if (IsSub)
+ Opcodes += 4 * Stride;
+ if (IsExchange)
+ Opcodes += 2 * Stride;
+ if (IsAccum)
+ Opcodes += Stride;
+ uint16_t Opcode = Opcodes[TySize];
+
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+ // Push the accumulator operands, if they are used
+ if (IsAccum) {
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(N->getOperand(5));
+ }
+ // Push the two vector operands
+ Ops.push_back(N->getOperand(6));
+ Ops.push_back(N->getOperand(7));
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(8));
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc);
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
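
To make the table layout concrete, the pointer bumping above is plain index arithmetic: index = (IsSub*4 + IsExchange*2 + IsAccum) * Stride + TySize. With Stride == 2 for the long non-rounding forms, the signed table added later in this file has 16 entries, so a signed, subtracting, exchanging, accumulating 32-bit reduction (vmlsldavaxq_s32) lands on index (4+2+1)*2 + 1 == 15, i.e. ARM::MVE_VMLSLDAVaxs32. A stand-alone sketch of the same computation (editor's illustration, not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* Table is OpcodesS or OpcodesU, already chosen by the unsigned flag.
   Each flag advances the lookup by a whole group of element widths, and
   TySize picks the element width within that group. */
static uint16_t pick_opcode(const uint16_t *Table, int IsSub, int IsExchange,
                            int IsAccum, size_t Stride, size_t TySize) {
  size_t Index =
      (size_t)(IsSub * 4 + IsExchange * 2 + IsAccum) * Stride + TySize;
  return Table[Index];
}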
+
+void ARMDAGToDAGISel::SelectMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU) {
+ EVT VecTy = N->getOperand(6).getValueType();
+ size_t SizeIndex;
+ switch (VecTy.getVectorElementType().getSizeInBits()) {
+ case 16:
+ SizeIndex = 0;
+ break;
+ case 32:
+ SizeIndex = 1;
+ break;
+ default:
+ llvm_unreachable("bad vector element size");
+ }
+
+ SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 2, SizeIndex);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU) {
+ EVT VecTy = N->getOperand(6).getValueType();
+ assert(VecTy.getVectorElementType().getSizeInBits() == 32 &&
+ "bad vector element size");
+ SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 1, 0);
+}
+
void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
const uint16_t *const *Opcodes) {
EVT VT = N->getValueType(0);
@@ -4376,6 +4487,42 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
IntNo == Intrinsic::arm_mve_vadc_predicated);
return;
+ case Intrinsic::arm_mve_vmlldava:
+ case Intrinsic::arm_mve_vmlldava_predicated: {
+ static const uint16_t OpcodesU[] = {
+ ARM::MVE_VMLALDAVu16, ARM::MVE_VMLALDAVu32,
+ ARM::MVE_VMLALDAVau16, ARM::MVE_VMLALDAVau32,
+ };
+ static const uint16_t OpcodesS[] = {
+ ARM::MVE_VMLALDAVs16, ARM::MVE_VMLALDAVs32,
+ ARM::MVE_VMLALDAVas16, ARM::MVE_VMLALDAVas32,
+ ARM::MVE_VMLALDAVxs16, ARM::MVE_VMLALDAVxs32,
+ ARM::MVE_VMLALDAVaxs16, ARM::MVE_VMLALDAVaxs32,
+ ARM::MVE_VMLSLDAVs16, ARM::MVE_VMLSLDAVs32,
+ ARM::MVE_VMLSLDAVas16, ARM::MVE_VMLSLDAVas32,
+ ARM::MVE_VMLSLDAVxs16, ARM::MVE_VMLSLDAVxs32,
+ ARM::MVE_VMLSLDAVaxs16, ARM::MVE_VMLSLDAVaxs32,
+ };
+ SelectMVE_VMLLDAV(N, IntNo == Intrinsic::arm_mve_vmlldava_predicated,
+ OpcodesS, OpcodesU);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vrmlldavha:
+ case Intrinsic::arm_mve_vrmlldavha_predicated: {
+ static const uint16_t OpcodesU[] = {
+ ARM::MVE_VRMLALDAVHu32, ARM::MVE_VRMLALDAVHau32,
+ };
+ static const uint16_t OpcodesS[] = {
+ ARM::MVE_VRMLALDAVHs32, ARM::MVE_VRMLALDAVHas32,
+ ARM::MVE_VRMLALDAVHxs32, ARM::MVE_VRMLALDAVHaxs32,
+ ARM::MVE_VRMLSLDAVHs32, ARM::MVE_VRMLSLDAVHas32,
+ ARM::MVE_VRMLSLDAVHxs32, ARM::MVE_VRMLSLDAVHaxs32,
+ };
+ SelectMVE_VRMLLDAVH(N, IntNo == Intrinsic::arm_mve_vrmlldavha_predicated,
+ OpcodesS, OpcodesU);
+ return;
+ }
}
break;
}
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 21f0d5e86790..a40231c4aa13 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -567,10 +567,10 @@ class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
let Inst{4} = 0b0;
}
-class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+class MVE_VABAV<string suffix, bit U, bits<2> size>
: MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
- pattern> {
+ []> {
bits<4> Qm;
bits<4> Qn;
bits<4> Rda;
@@ -589,12 +589,36 @@ class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{0} = 0b1;
}
-def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>;
-def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>;
-def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>;
-def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>;
-def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>;
-def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>;
+multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VABAV<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (int_arm_mve_vabav
+ (i32 VTI.Unsigned),
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (!cast<Instruction>(NAME)
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vabav_predicated
+ (i32 VTI.Unsigned),
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (!cast<Instruction>(NAME)
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+defm MVE_VABAVs8 : MVE_VABAV_m<MVE_v16s8>;
+defm MVE_VABAVs16 : MVE_VABAV_m<MVE_v8s16>;
+defm MVE_VABAVs32 : MVE_VABAV_m<MVE_v4s32>;
+defm MVE_VABAVu8 : MVE_VABAV_m<MVE_v16u8>;
+defm MVE_VABAVu16 : MVE_VABAV_m<MVE_v8u16>;
+defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>;
class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, bits<2> size, list<dag> pattern=[]>
@@ -803,10 +827,9 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
- bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]>
+ bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
: MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
- "$RdaDest, $Qn, $Qm", cstr, pattern> {
+ "$RdaDest, $Qn, $Qm", cstr, []> {
bits<4> RdaDest;
bits<3> Qm;
bits<3> Qn;
@@ -824,47 +847,88 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix,
- bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]> {
- def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix,
+multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> {
+ def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix,
(ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
- def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix,
+ sz, bit_28, 0b0, X, bit_8, bit_0>;
+ def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix,
(ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaDest = $RdaSrc",
- sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+ sz, bit_28, 0b1, X, bit_8, bit_0>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (int_arm_mve_vmldava
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 0) /* accumulator */,
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava_predicated
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 0) /* accumulator */,
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava_predicated
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
}
-multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28,
- 0b0, bit_8, bit_0, pattern>;
- defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28,
- 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLAMLSDAV_AX<string iname, MVEVectorVTInfo VTI, bit sz,
+ bit bit_28, bit bit_8, bit bit_0> {
+ defm "" : MVE_VMLAMLSDAV_A<iname, "", VTI, sz, bit_28,
+ 0b0, bit_8, bit_0>;
+ defm "" : MVE_VMLAMLSDAV_A<iname, "x", VTI, sz, bit_28,
+ 0b1, bit_8, bit_0>;
}
-multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix,
- sz, 0b0, bit_8, 0b0, pattern>;
- defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix,
- sz, 0b1, 0b0, bit_8, 0b0, pattern>;
+multiclass MVE_VMLADAV_multi<MVEVectorVTInfo SVTI, MVEVectorVTInfo UVTI,
+ bit sz, bit bit_8> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmladav", SVTI,
+ sz, 0b0, bit_8, 0b0>;
+ defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", UVTI,
+ sz, 0b1, 0b0, bit_8, 0b0>;
}
-multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix,
- sz, bit_28, 0b0, 0b1, pattern>;
+multiclass MVE_VMLSDAV_multi<MVEVectorVTInfo VTI, bit sz, bit bit_28> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", VTI,
+ sz, bit_28, 0b0, 0b1>;
}
-defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v16s8, MVE_v16u8, 0b0, 0b1>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v8s16, MVE_v8u16, 0b0, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v4s32, MVE_v4u32, 0b1, 0b0>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
// vmlav aliases vmladav
foreach acc = ["", "a"] in {
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
new file mode 100644
index 000000000000..f7fb69e8c311
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.vabav.v16i8(i32, i32, <16 x i8>, <16 x i8>)
+declare i32 @llvm.arm.mve.vabav.v8i16(i32, i32, <8 x i16>, <8 x i16>)
+declare i32 @llvm.arm.mve.vabav.v4i32(i32, i32, <4 x i32>, <4 x i32>)
+
+declare i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32, i32, <16 x i8>, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vabavq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vabavq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vabavq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v4i32(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vabavq_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vabavq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vabavq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabav.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vabavt.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
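For reference, VABAV accumulates the absolute differences of corresponding lanes into the scalar operand, which is what the i32 accumulator and the leading unsigned flag in the calls above feed. A minimal scalar model of the .s8 form follows; it is a sketch of the architectural semantics, not code from the patch, and the name vabav_s8_ref is made up for illustration.

#include <stdint.h>

/* Scalar model of VABAV.S8: accumulate |Qn[i] - Qm[i]| over all 16 lanes
   into the 32-bit accumulator. */
uint32_t vabav_s8_ref(uint32_t acc, const int8_t qn[16], const int8_t qm[16]) {
  for (int i = 0; i < 16; i++) {
    int32_t d = (int32_t)qn[i] - (int32_t)qm[i];
    acc += (uint32_t)(d < 0 ? -d : d);
  }
  return acc;
}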
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
new file mode 100644
index 000000000000..ae8f24386b9c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
@@ -0,0 +1,734 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.vmldava.v16i8(i32, i32, i32, i32, <16 x i8>, <16 x i8>)
+declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>)
+declare i32 @llvm.arm.mve.vmldava.v4i32(i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+
+declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaq_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlava.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaxq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavax.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavax.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavax.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdava.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdava.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdava.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavax.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavax.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavax.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavat.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavaxt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavaxt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavaxt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavat.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavat.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavat.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavaxt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavaxt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavaxt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavq_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlav.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavxq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavx.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavx.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmladavx.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmlsdavq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdav.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsdavq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdav.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsdavq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdav.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavx.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavx.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsdavx.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.u8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.u16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlavt.u32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavxt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavxt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmladavxt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavxt.s8 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavxt.s16 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsdavxt.s32 r0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ ret i32 %2
+}
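The tests above cover every combination of the vmldava flag operands: unsigned, subtract (the VMLSDAV forms) and exchange (the -x forms). A scalar model of what those flags select is sketched below for the .s16 case; it reflects the architectural pairing of adjacent lanes and is illustration only (vmladav_s16_ref is a hypothetical name, and the accumulation wraps modulo 2^32 like the instruction).

#include <stdint.h>

/* Scalar model of VMLADAV/VMLSDAV .s16: lanes are processed in adjacent
   pairs; "exchange" swaps the second operand within each pair, and
   "subtract" turns the second product of each pair into a subtraction. */
int32_t vmladav_s16_ref(int32_t acc, const int16_t qn[8], const int16_t qm[8],
                        int subtract, int exchange) {
  for (int i = 0; i < 8; i += 2) {
    int32_t p0 = (int32_t)qn[i]     * (int32_t)qm[exchange ? i + 1 : i];
    int32_t p1 = (int32_t)qn[i + 1] * (int32_t)qm[exchange ? i : i + 1];
    acc += subtract ? p0 - p1 : p0 + p1;
  }
  return acc;
}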
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll
new file mode 100644
index 000000000000..9a215b369335
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll
@@ -0,0 +1,1183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+declare { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+
+declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+declare { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
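Because the long reductions return their 64-bit accumulator as an {i32, i32} pair, each test open-codes the same split of the incoming i64 into halves and the reassembly of the result, as in the lshr/trunc and shl/or sequences above. The equivalent C packing is just the following (assumed helper names, illustration only):

#include <stdint.h>

/* Split a 64-bit accumulator into the low/high halves the intrinsic takes,
   and rejoin the two halves it returns. */
static inline void split_u64(uint64_t a, uint32_t *lo, uint32_t *hi) {
  *lo = (uint32_t)a;
  *hi = (uint32_t)(a >> 32);
}
static inline uint64_t join_u64(uint32_t lo, uint32_t hi) {
  return ((uint64_t)hi << 32) | (uint64_t)lo;
}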
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlaldavax.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlaldavax.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsldavaq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldava.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsldavaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldava.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsldavaxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldavax.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsldavaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldavax.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlalvha.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlalvha.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlaldavhax.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlsldavhaq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlsldavha.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlsldavhaxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlsldavhax.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlaldavaxt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlaldavaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavat.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavat.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavaxt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlalvhat.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlalvhat.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlaldavhaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhaq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlsldavhat.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhaxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlsldavhaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %a, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %a to i32
+ %3 = zext i16 %p to i32
+ %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+ %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+ %6 = extractvalue { i32, i32 } %5, 1
+ %7 = zext i32 %6 to i64
+ %8 = shl i64 %7, 32
+ %9 = extractvalue { i32, i32 } %5, 0
+ %10 = zext i32 %9 to i64
+ %11 = or i64 %8, %10
+ ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavq_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlaldavx.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlaldavx.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsldavq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldav.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsldavq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldav.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsldavxq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldavx.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsldavxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmlsldavx.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlalvh.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlalvh.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlaldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlsldavh.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhxq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrmlsldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlaldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlaldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavxq_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmlsldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlalvht.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlalvht.u32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlaldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlsldavht.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhxq_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrmlsldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+ %3 = extractvalue { i32, i32 } %2, 1
+ %4 = zext i32 %3 to i64
+ %5 = shl i64 %4, 32
+ %6 = extractvalue { i32, i32 } %2, 0
+ %7 = zext i32 %6 to i64
+ %8 = or i64 %5, %7
+ ret i64 %8
+}
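
For readers following the IR above: every 64-bit-accumulator test uses the same lowering shape. The incoming i64 accumulator (where present) is split into two i32 halves, the {i32, i32}-returning llvm.arm.mve.vmlldava or llvm.arm.mve.vrmlldavha intrinsic is called (with the predicate materialised via llvm.arm.mve.pred.i2v in the _p variants), and the two halves of the result are shifted and OR-ed back together into an i64. Below is a minimal C sketch of how such IR is reached through the ACLE intrinsics this patch implements; the wrapper function and its name are illustrative only and not part of the commit.

#include <arm_mve.h>

/* Illustrative wrapper (not from the patch): a predicated, accumulating
   multiply-subtract long reduction, matching the IR in
   test_vmlsldavaq_p_s32 above.  Clang emits a call to
   llvm.arm.mve.vmlldava.predicated with the i64 accumulator split into
   two i32 halves and reassembled after the call. */
int64_t sketch_vmlsldavaq_p_s32(int64_t acc, int32x4_t a, int32x4_t b,
                                mve_pred16_t p) {
  return vmlsldavaq_p_s32(acc, a, b, p);
}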