[clang] 99581fd - [ARM][MVE] Add vector reduction intrinsics with two vector operands

Mikhail Maltsev via cfe-commits cfe-commits at lists.llvm.org
Fri Dec 13 05:18:02 PST 2019


Author: Mikhail Maltsev
Date: 2019-12-13T13:17:29Z
New Revision: 99581fd4c8e12f5eca38e7cfc5992508a9bfe383

URL: https://github.com/llvm/llvm-project/commit/99581fd4c8e12f5eca38e7cfc5992508a9bfe383
DIFF: https://github.com/llvm/llvm-project/commit/99581fd4c8e12f5eca38e7cfc5992508a9bfe383.diff

LOG: [ARM][MVE] Add vector reduction intrinsics with two vector operands

Summary:
This patch adds intrinsics for the following MVE instructions:
* VABAV
* VMLADAV, VMLSDAV
* VMLALDAV, VMLSLDAV
* VRMLALDAVH, VRMLSLDAVH

Each of the four groups above gets its own new LLVM IR intrinsic, since
these instructions cannot easily be expressed using general-purpose IR
operations.
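
As a minimal illustration (mirroring the autogenerated vabavq.c test added
below; the function name here is purely illustrative), the user-facing VABAV
intrinsic lowers to a single call of the new IR intrinsic, with the leading
i32 operand encoding unsignedness:

  #include <arm_mve.h>

  uint32_t abs_diff_accumulate(uint32_t acc, int8x16_t b, int8x16_t c) {
    /* Lowers to roughly:
         %r = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 %acc,
                                                 <16 x i8> %b, <16 x i8> %c)
       where the leading i32 0 is the unsigned flag (1 for the _u variants),
       and the vector type suffix follows the element type. */
    return vabavq_s8(acc, b, c);
  }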

Reviewers: simon_tatham, ostannard, dmgreen, MarkMurrayARM

Reviewed By: MarkMurrayARM

Subscribers: merge_guards_bot, kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71062

Added: 
    clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
    clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/include/clang/Basic/arm_mve_defs.td
    llvm/include/llvm/IR/IntrinsicsARM.td
    llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 9d9c067ade1c..6a27bdb807a9 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -775,6 +775,118 @@ defm vcmulq : VectorComplexMulAngle;
 defm vcmlaq : VectorComplexMLAAngle;
 }
 
+multiclass MVEBinaryVectorHoriz32<dag subtract, dag exchange, string xsuffix> {
+  def xsuffix#"q"
+    : Intrinsic<Scalar32, (args Vector:$a, Vector:$b),
+                          (IRInt<"vmldava", [Vector]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           (zeroinit Scalar32), $a, $b)>;
+  def xsuffix#"q_p"
+    : Intrinsic<Scalar32, (args Vector:$a, Vector:$b, Predicate:$pred),
+                          (IRInt<"vmldava_predicated", [Vector, Predicate]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           (zeroinit Scalar32), $a, $b, $pred)>;
+
+  def "a"#xsuffix#"q"
+    : Intrinsic<Scalar32, (args Scalar32:$a, Vector:$b, Vector:$c),
+                          (IRInt<"vmldava", [Vector]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $a, $b, $c)>;
+  def "a"#xsuffix#"q_p"
+    : Intrinsic<Scalar32, (args Scalar32:$a, Vector:$b, Vector:$c,
+                                Predicate:$pred),
+                          (IRInt<"vmldava_predicated", [Vector, Predicate]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $a, $b, $c, $pred)>;
+}
+
+class IntrSplit64<Type resty, dag args, dag codegen>
+  : Intrinsic<resty, args,
+              (seq (u32 (lshr $a, (u64 32))):$hi,
+                   (u32 $a):$lo,
+                   codegen:$pair,
+                   (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                            (u64 (xval $pair, 0))))>;
+
+class IntrSplit64ZeroInit<Type resty, dag args, dag codegen>
+  : Intrinsic<resty, args,
+              (seq (zeroinit u32):$hi,
+                   (zeroinit u32):$lo,
+                   codegen:$pair,
+                   (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                            (u64 (xval $pair, 0))))>;
+
+multiclass MVEBinaryVectorHoriz64Base<dag subtract, dag exchange,
+                                      string xsuffix, string irname> {
+  def xsuffix#"q"
+    : IntrSplit64ZeroInit<Scalar64, (args Vector:$a, Vector:$b),
+                          (IRInt<irname, [Vector]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $lo, $hi, $a, $b)>;
+  def xsuffix#"q_p"
+    : IntrSplit64ZeroInit<Scalar64, (args Vector:$a, Vector:$b,
+                                          Predicate:$pred),
+                          (IRInt<irname#"_predicated", [Vector, Predicate]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $lo, $hi, $a, $b, $pred)>;
+
+  def "a"#xsuffix#"q"
+    : IntrSplit64<Scalar64, (args Scalar64:$a, Vector:$b, Vector:$c),
+                          (IRInt<irname, [Vector]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $lo, $hi, $b, $c)>;
+  def "a"#xsuffix#"q_p"
+    : IntrSplit64<Scalar64, (args Scalar64:$a, Vector:$b, Vector:$c,
+                                  Predicate:$pred),
+                          (IRInt<irname#"_predicated", [Vector, Predicate]>
+                           (unsignedflag Scalar), subtract, exchange,
+                           $lo, $hi, $b, $c, $pred)>;
+}
+
+multiclass MVEBinaryVectorHoriz64<dag subtract, dag exchange, string xsuffix> {
+  defm "" : MVEBinaryVectorHoriz64Base<subtract, exchange, xsuffix, "vmlldava">;
+}
+
+multiclass MVEBinaryVectorHoriz64R<dag subtract, dag exchange, string xsuffix> {
+  defm "" : MVEBinaryVectorHoriz64Base<subtract, exchange, xsuffix,
+                                       "vrmlldavha">;
+}
+
+let params = T.Int in {
+def vabavq : Intrinsic<u32, (args u32:$a, Vector:$b, Vector:$c),
+    (IRInt<"vabav", [Vector]> (unsignedflag Scalar), $a, $b, $c)>;
+def vabavq_p : Intrinsic<u32, (args u32:$a, Vector:$b, Vector:$c,
+                                    Predicate:$pred),
+    (IRInt<"vabav_predicated", [Vector, Predicate]>
+                               (unsignedflag Scalar), $a, $b, $c, $pred)>;
+
+defm vmladav  : MVEBinaryVectorHoriz32<V.False, V.False, "">;
+}
+
+let params = T.Signed in {
+defm vmladav : MVEBinaryVectorHoriz32<V.False, V.True, "x">;
+defm vmlsdav : MVEBinaryVectorHoriz32<V.True, V.False, "">;
+defm vmlsdav : MVEBinaryVectorHoriz32<V.True, V.True, "x">;
+}
+
+let params = [u16, s16, u32, s32] in
+defm vmlaldav : MVEBinaryVectorHoriz64<V.False, V.False, "">;
+
+let params = [s16, s32] in {
+defm vmlaldav : MVEBinaryVectorHoriz64<V.False, V.True, "x">;
+defm vmlsldav : MVEBinaryVectorHoriz64<V.True, V.False, "">;
+defm vmlsldav : MVEBinaryVectorHoriz64<V.True, V.True, "x">;
+}
+
+let params = T.Int32 in
+defm vrmlaldavh : MVEBinaryVectorHoriz64R<V.False, V.False, "">;
+
+let params = [s32] in {
+defm vrmlaldavh : MVEBinaryVectorHoriz64R<V.False, V.True, "x">;
+defm vrmlsldavh : MVEBinaryVectorHoriz64R<V.True, V.False, "">;
+defm vrmlsldavh : MVEBinaryVectorHoriz64R<V.True, V.True, "x">;
+}
+
 foreach desttype = T.All in {
   // We want a vreinterpretq between every pair of supported vector types
   // _except_ that there shouldn't be one from a type to itself.

diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 3e22e44607ca..03472fb47b6c 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -284,6 +284,11 @@ def UVector: VecOf<UScalar>;
 // Scalar.
 def DblVector: VecOf<DoubleSize<Scalar>>;
 
+// Expands to the 32-bit integer of the same signedness as Scalar.
+def Scalar32: CopyKind<u32, Scalar>;
+// Expands to the 64-bit integer of the same signedness as Scalar.
+def Scalar64: CopyKind<u64, Scalar>;
+
 // -----------------------------------------------------------------------------
 // Internal definitions for specifying immediate arguments for an intrinsic.
 
@@ -478,3 +483,13 @@ def T {
   list<Type> All64 = Int64;
   list<Type> All = Usual # All64;
 }
+
+// -----------------------------------------------------------------------------
+// Container record for DAG constant values. These constants are used because
+// bit/int class/multiclass parameters cannot be used to produce a dag node:
+// for example (u32 x) where x is 0 is transformed into (u32 { 0 }) by the
+// Tablegen parser.
+def V {
+  dag False = (u32 0);
+  dag True  = (u32 1);
+}

diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
new file mode 100644
index 000000000000..e1362760bb8f
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vabavq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s8(uint32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s16(uint32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_s32(uint32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_u8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vabavq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+  return vabavq(a, b, c);
+#else
+  return vabavq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s8(uint32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s16(uint32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_s32(uint32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_u8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vabavq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vabavq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vabavq_p(a, b, c, p);
+#else
+  return vabavq_p_u32(a, b, c, p);
+#endif
+}

diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
new file mode 100644
index 000000000000..60339ff8db56
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
@@ -0,0 +1,845 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmladavaq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u8(uint32_t a, uint8x16_t b, uint8x16_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_u8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u16(uint32_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavaq_u32(uint32_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaq(a, b, c);
+#else
+  return vmladavaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaxq(a, b, c);
+#else
+  return vmladavaxq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaxq(a, b, c);
+#else
+  return vmladavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmladavaxq(a, b, c);
+#else
+  return vmladavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq(a, b, c);
+#else
+  return vmlsdavaq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq(a, b, c);
+#else
+  return vmlsdavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq(a, b, c);
+#else
+  return vmlsdavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s8(int32_t a, int8x16_t b, int8x16_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq(a, b, c);
+#else
+  return vmlsdavaxq_s8(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s16(int32_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq(a, b, c);
+#else
+  return vmlsdavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavaxq_s32(int32_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq(a, b, c);
+#else
+  return vmlsdavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u8(uint32_t a, uint8x16_t b, uint8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_u8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u16(uint32_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavaq_p_u32(uint32_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaq_p(a, b, c, p);
+#else
+  return vmladavaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaxq_p(a, b, c, p);
+#else
+  return vmladavaxq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaxq_p(a, b, c, p);
+#else
+  return vmladavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavaxq_p(a, b, c, p);
+#else
+  return vmladavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq_p(a, b, c, p);
+#else
+  return vmlsdavaq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq_p(a, b, c, p);
+#else
+  return vmlsdavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaq_p(a, b, c, p);
+#else
+  return vmlsdavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s8(int32_t a, int8x16_t b, int8x16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq_p(a, b, c, p);
+#else
+  return vmlsdavaxq_p_s8(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s16(int32_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq_p(a, b, c, p);
+#else
+  return vmlsdavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavaxq_p_s32(int32_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavaxq_p(a, b, c, p);
+#else
+  return vmlsdavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u8(uint8x16_t a, uint8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_u8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u16(uint16x8_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_u16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmladavq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmladavq(a, b);
+#else
+  return vmladavq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmladavxq(a, b);
+#else
+  return vmladavxq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmladavxq(a, b);
+#else
+  return vmladavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmladavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmladavxq(a, b);
+#else
+  return vmladavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavq(a, b);
+#else
+  return vmlsdavq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavq(a, b);
+#else
+  return vmlsdavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavq(a, b);
+#else
+  return vmlsdavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s8(int8x16_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq(a, b);
+#else
+  return vmlsdavxq_s8(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq(a, b);
+#else
+  return vmlsdavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmlsdavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq(a, b);
+#else
+  return vmlsdavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_u8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_u16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmladavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavq_p(a, b, p);
+#else
+  return vmladavq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavxq_p(a, b, p);
+#else
+  return vmladavxq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavxq_p(a, b, p);
+#else
+  return vmladavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmladavxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmladavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmladavxq_p(a, b, p);
+#else
+  return vmladavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavq_p(a, b, p);
+#else
+  return vmlsdavq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavq_p(a, b, p);
+#else
+  return vmlsdavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavq_p(a, b, p);
+#else
+  return vmlsdavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq_p(a, b, p);
+#else
+  return vmlsdavxq_p_s8(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq_p(a, b, p);
+#else
+  return vmlsdavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsdavxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmlsdavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsdavxq_p(a, b, p);
+#else
+  return vmlsdavxq_p_s32(a, b, p);
+#endif
+}

diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
new file mode 100644
index 000000000000..8bc1e2531e98
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
@@ -0,0 +1,1295 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmlaldavaq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq(a, b, c);
+#else
+  return vmlaldavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq(a, b, c);
+#else
+  return vmlaldavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_vmlaldavaq_u16(uint64_t a, uint16x8_t b, uint16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq(a, b, c);
+#else
+  return vmlaldavaq_u16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_vmlaldavaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq(a, b, c);
+#else
+  return vmlaldavaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaxq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaxq(a, b, c);
+#else
+  return vmlaldavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlaldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlaldavaxq(a, b, c);
+#else
+  return vmlaldavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlsldavaq(a, b, c);
+#else
+  return vmlsldavaq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlsldavaq(a, b, c);
+#else
+  return vmlsldavaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldaxvaq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlsldaxvaq_s16(int64_t a, int16x8_t b, int16x8_t c) {
+#ifdef POLYMORPHIC
+  return vmlsldavaxq(a, b, c);
+#else
+  return vmlsldavaxq_s16(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vmlsldavaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vmlsldavaxq(a, b, c);
+#else
+  return vmlsldavaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vrmlaldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaq(a, b, c);
+#else
+  return vrmlaldavhaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_vrmlaldavhaq_u32(uint64_t a, uint32x4_t b, uint32x4_t c) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaq(a, b, c);
+#else
+  return vrmlaldavhaq_u32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vrmlaldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaxq(a, b, c);
+#else
+  return vrmlaldavhaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vrmlsldavhaq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhaq(a, b, c);
+#else
+  return vrmlsldavhaq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_vrmlsldavhaxq_s32(int64_t a, int32x4_t b, int32x4_t c) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhaxq(a, b, c);
+#else
+  return vrmlsldavhaxq_s32(a, b, c);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq_p(a, b, c, p);
+#else
+  return vmlaldavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq_p(a, b, c, p);
+#else
+  return vmlaldavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+uint64_t test_vmlaldavaq_p_u16(uint64_t a, uint16x8_t b, uint16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq_p(a, b, c, p);
+#else
+  return vmlaldavaq_p_u16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+uint64_t test_vmlaldavaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaq_p(a, b, c, p);
+#else
+  return vmlaldavaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaxq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaxq_p(a, b, c, p);
+#else
+  return vmlaldavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlaldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavaxq_p(a, b, c, p);
+#else
+  return vmlaldavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavaq_p(a, b, c, p);
+#else
+  return vmlsldavaq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavaq_p(a, b, c, p);
+#else
+  return vmlsldavaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaxq_p_s16(int64_t a, int16x8_t b, int16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavaxq_p(a, b, c, p);
+#else
+  return vmlsldavaxq_p_s16(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vmlsldavaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavaxq_p(a, b, c, p);
+#else
+  return vmlsldavaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vrmlaldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaq_p(a, b, c, p);
+#else
+  return vrmlaldavhaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+uint64_t test_vrmlaldavhaq_p_u32(uint64_t a, uint32x4_t b, uint32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaq_p(a, b, c, p);
+#else
+  return vrmlaldavhaq_p_u32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vrmlaldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhaxq_p(a, b, c, p);
+#else
+  return vrmlaldavhaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vrmlsldavhaq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhaq_p(a, b, c, p);
+#else
+  return vrmlsldavhaq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhaxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[A:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[A]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 [[TMP2]], i32 [[TMP1]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i1> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP5]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 32
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i32 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP10]]
+// CHECK-NEXT:    ret i64 [[TMP11]]
+//
+int64_t test_vrmlsldavhaxq_p_s32(int64_t a, int32x4_t b, int32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhaxq_p(a, b, c, p);
+#else
+  return vrmlsldavhaxq_p_s32(a, b, c, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavq(a, b);
+#else
+  return vmlaldavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavq(a, b);
+#else
+  return vmlaldavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_vmlaldavq_u16(uint16x8_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavq(a, b);
+#else
+  return vmlaldavq_u16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_vmlaldavq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavq(a, b);
+#else
+  return vmlaldavq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavxq(a, b);
+#else
+  return vmlaldavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlaldavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlaldavxq(a, b);
+#else
+  return vmlaldavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlsldavq(a, b);
+#else
+  return vmlsldavq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlsldavq(a, b);
+#else
+  return vmlsldavq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavxq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmlsldavxq(a, b);
+#else
+  return vmlsldavxq_s16(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vmlsldavxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmlsldavxq(a, b);
+#else
+  return vmlsldavxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vrmlaldavhq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhq(a, b);
+#else
+  return vrmlaldavhq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_vrmlaldavhq_u32(uint32x4_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhq(a, b);
+#else
+  return vrmlaldavhq_u32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vrmlaldavhxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhxq(a, b);
+#else
+  return vrmlaldavhxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vrmlsldavhq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhq(a, b);
+#else
+  return vrmlsldavhq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhxq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+int64_t test_vrmlsldavhxq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhxq(a, b);
+#else
+  return vrmlsldavhxq_s32(a, b);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavq_p(a, b, p);
+#else
+  return vmlaldavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavq_p(a, b, p);
+#else
+  return vmlaldavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+uint64_t test_vmlaldavq_p_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavq_p(a, b, p);
+#else
+  return vmlaldavq_p_u16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+uint64_t test_vmlaldavq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavq_p(a, b, p);
+#else
+  return vmlaldavq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavxq_p(a, b, p);
+#else
+  return vmlaldavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlaldavxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlaldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlaldavxq_p(a, b, p);
+#else
+  return vmlaldavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavq_p(a, b, p);
+#else
+  return vmlsldavq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavq_p(a, b, p);
+#else
+  return vmlsldavq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavxq_p_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavxq_p(a, b, p);
+#else
+  return vmlsldavxq_p_s16(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vmlsldavxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vmlsldavxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmlsldavxq_p(a, b, p);
+#else
+  return vmlsldavxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vrmlaldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhq_p(a, b, p);
+#else
+  return vrmlaldavhq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+uint64_t test_vrmlaldavhq_p_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhq_p(a, b, p);
+#else
+  return vrmlaldavhq_p_u32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlaldavhxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vrmlaldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlaldavhxq_p(a, b, p);
+#else
+  return vrmlaldavhxq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vrmlsldavhq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhq_p(a, b, p);
+#else
+  return vrmlsldavhq_p_s32(a, b, p);
+#endif
+}
+
+// CHECK-LABEL: @test_vrmlsldavhxq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+// CHECK-NEXT:    ret i64 [[TMP8]]
+//
+int64_t test_vrmlsldavhxq_p_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vrmlsldavhxq_p(a, b, p);
+#else
+  return vrmlsldavhxq_p_s32(a, b, p);
+#endif
+}
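
The long reductions return their result as an { i32, i32 } pair, so the generated code above splits the incoming 64-bit accumulator with lshr/trunc and re-joins the result with shl/or. A minimal scalar model of that split/join, in C (the helper names here are illustrative only, not part of the patch):

    #include <stdint.h>

    /* Models how a 64-bit accumulator is passed as two 32-bit halves and how
     * the { i32, i32 } result is reassembled, mirroring the lshr/trunc and
     * shl/or sequences checked above. */
    typedef struct { uint32_t lo, hi; } acc_pair;

    static acc_pair split_acc(uint64_t a) {
      acc_pair p = { (uint32_t)a, (uint32_t)(a >> 32) };
      return p;
    }

    static uint64_t join_acc(acc_pair p) {
      return ((uint64_t)p.hi << 32) | (uint64_t)p.lo;
    }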

diff  --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index a10620612e23..f76ba5635127 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1046,4 +1046,51 @@ def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMat
 def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
 
+// MVE vector absolute difference and accumulate across vector
+// The first operand is an 'unsigned' flag. The remaining operands are:
+// * accumulator
+// * first vector operand
+// * second vector operand
+// * mask (only in predicated versions)
+defm int_arm_mve_vabav: MVEPredicated<
+  [llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty,
+  [IntrNoMem]>;
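
As a source-level usage sketch (assuming an MVE-enabled target and <arm_mve.h>, as exercised by the new vabavq.c test), the operand order mirrors the comment above: accumulator, two vectors, then the mask in the predicated form:

    #include <arm_mve.h>

    /* Accumulates |b[i] - c[i]| into the scalar accumulator. */
    uint32_t sad_u8(uint32_t acc, uint8x16_t b, uint8x16_t c) {
      return vabavq_u8(acc, b, c);
    }

    uint32_t sad_u8_masked(uint32_t acc, uint8x16_t b, uint8x16_t c,
                           mve_pred16_t p) {
      return vabavq_p_u8(acc, b, c, p);  /* mask is the last operand */
    }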
+
+// The following 3 intrinsics are MVE vector reductions with two vector
+// operands.
+// The first 3 operands are boolean flags (must be compile-time constants):
+// * unsigned - the instruction operates on vectors of unsigned values and
+//              unsigned scalars
+// * subtract - the instruction performs subtraction after multiplication of
+//              lane pairs (e.g., vmlsdav vs vmladav)
+// * exchange - the instruction exchanges successive even and odd lanes of
+//              the first operand before multiplication of lane pairs
+//              (e.g., vmladavx vs vmladav)
+// The remaining operands are:
+// * accumulator
+// * first vector operand
+// * second vector operand
+// * mask (only in predicated versions)
+
+// Version with 32-bit result, vml{a,s}dav[a][x]
+defm int_arm_mve_vmldava: MVEPredicated<
+  [llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+   llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+  llvm_anyvector_ty, [IntrNoMem]>;
+
+// Version with 64-bit result, vml{a,s}ldav[a][x]
+defm int_arm_mve_vmlldava: MVEPredicated<
+  [llvm_i32_ty, llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+   llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+  llvm_anyvector_ty, [IntrNoMem]>;
+
+// Version with 72-bit rounded result, vrml{a,s}ldavh[a][x]
+defm int_arm_mve_vrmlldavha: MVEPredicated<
+  [llvm_i32_ty, llvm_i32_ty],
+  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+   llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+  llvm_anyvector_ty, [IntrNoMem]>;
 } // end TargetPrefix
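
For reference, a short C-level sketch of how a few of the source intrinsics map onto the (unsigned, subtract, exchange) flag triples; the flag values are the ones visible in the generated tests above:

    #include <stdint.h>
    #include <arm_mve.h>

    int64_t reductions(int64_t acc, int32x4_t b, int32x4_t c) {
      int64_t r = 0;
      r += vmlaldavq_s32(b, c);           /* vmlldava   (0, 0, 0), zero acc   */
      r += vmlsldavxq_s32(b, c);          /* vmlldava   (0, 1, 1), zero acc   */
      r += vmlaldavaxq_s32(acc, b, c);    /* vmlldava   (0, 0, 1), acc passed */
      r += vrmlsldavhaxq_s32(acc, b, c);  /* vrmlldavha (0, 1, 1), acc passed */
      return r;
    }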

diff  --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6dd56b35d0ab..acbbf20f3ef9 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -237,6 +237,27 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
   void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
                          uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
 
+  /// Select long MVE vector reductions with two vector operands
+  /// Stride is the number of vector element widths the instruction can operate
+  /// on:
+  /// 2 for long non-rounding variants, vml{a,s}ldav[a][x]: [i16, i32]
+  /// 1 for long rounding variants: vrml{a,s}ldavh[a][x]: [i32]
+  /// Stride is used when addressing the OpcodesS array which contains multiple
+  /// opcodes for each element width.
+  /// TySize is the index into the list of element types listed above
+  void SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+                             const uint16_t *OpcodesS, const uint16_t *OpcodesU,
+                             size_t Stride, size_t TySize);
+
+  /// Select a 64-bit MVE vector reduction with two vector operands
+  /// int_arm_mve_vmlldava[_predicated]
+  void SelectMVE_VMLLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+                         const uint16_t *OpcodesU);
+  /// Select a 72-bit MVE vector rounding reduction with two vector operands
+  /// int_arm_mve_vrmlldavha[_predicated]
+  void SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+                           const uint16_t *OpcodesU);
+
   /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs
   /// should be 2 or 4. The opcode array specifies the instructions
   /// used for 8, 16 and 32-bit lane sizes respectively, and each
@@ -2531,6 +2552,96 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }
 
+static bool SDValueToConstBool(SDValue SDVal) {
+  assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
+  ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
+  uint64_t Value = SDValConstant->getZExtValue();
+  assert((Value == 0 || Value == 1) && "expected value 0 or 1");
+  return Value;
+}
+
+void ARMDAGToDAGISel::SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+                                            const uint16_t *OpcodesS,
+                                            const uint16_t *OpcodesU,
+                                            size_t Stride, size_t TySize) {
+  assert(TySize < Stride && "Invalid TySize");
+  bool IsUnsigned = SDValueToConstBool(N->getOperand(1));
+  bool IsSub = SDValueToConstBool(N->getOperand(2));
+  bool IsExchange = SDValueToConstBool(N->getOperand(3));
+  if (IsUnsigned) {
+    assert(!IsSub &&
+           "Unsigned versions of vmlsldav[a]/vrmlsldavh[a] do not exist");
+    assert(!IsExchange &&
+           "Unsigned versions of vmlaldav[a]x/vrmlaldavh[a]x do not exist");
+  }
+
+  auto OpIsZero = [N](size_t OpNo) {
+    if (ConstantSDNode *OpConst = dyn_cast<ConstantSDNode>(N->getOperand(OpNo)))
+      if (OpConst->getZExtValue() == 0)
+        return true;
+    return false;
+  };
+
+  // If the accumulator input is anything other than a constant zero, select
+  // the accumulating form of the instruction; otherwise select the form
+  // without an accumulator.
+  bool IsAccum = !(OpIsZero(4) && OpIsZero(5));
+
+  const uint16_t *Opcodes = IsUnsigned ? OpcodesU : OpcodesS;
+  if (IsSub)
+    Opcodes += 4 * Stride;
+  if (IsExchange)
+    Opcodes += 2 * Stride;
+  if (IsAccum)
+    Opcodes += Stride;
+  uint16_t Opcode = Opcodes[TySize];
+
+  SDLoc Loc(N);
+  SmallVector<SDValue, 8> Ops;
+  // Push the accumulator operands, if they are used
+  if (IsAccum) {
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(N->getOperand(5));
+  }
+  // Push the two vector operands
+  Ops.push_back(N->getOperand(6));
+  Ops.push_back(N->getOperand(7));
+
+  if (Predicated)
+    AddMVEPredicateToOps(Ops, Loc, N->getOperand(8));
+  else
+    AddEmptyMVEPredicateToOps(Ops, Loc);
+
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_VMLLDAV(SDNode *N, bool Predicated,
+                                        const uint16_t *OpcodesS,
+                                        const uint16_t *OpcodesU) {
+  EVT VecTy = N->getOperand(6).getValueType();
+  size_t SizeIndex;
+  switch (VecTy.getVectorElementType().getSizeInBits()) {
+  case 16:
+    SizeIndex = 0;
+    break;
+  case 32:
+    SizeIndex = 1;
+    break;
+  default:
+    llvm_unreachable("bad vector element size");
+  }
+
+  SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 2, SizeIndex);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated,
+                                          const uint16_t *OpcodesS,
+                                          const uint16_t *OpcodesU) {
+  EVT VecTy = N->getOperand(6).getValueType();
+  assert(VecTy.getVectorElementType().getSizeInBits() == 32 &&
+         "bad vector element size");
+  SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 1, 0);
+}
+
 void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
                                     const uint16_t *const *Opcodes) {
   EVT VT = N->getValueType(0);
@@ -4376,6 +4487,42 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
                         IntNo == Intrinsic::arm_mve_vadc_predicated);
       return;
 
+    case Intrinsic::arm_mve_vmlldava:
+    case Intrinsic::arm_mve_vmlldava_predicated: {
+      static const uint16_t OpcodesU[] = {
+          ARM::MVE_VMLALDAVu16,   ARM::MVE_VMLALDAVu32,
+          ARM::MVE_VMLALDAVau16,  ARM::MVE_VMLALDAVau32,
+      };
+      static const uint16_t OpcodesS[] = {
+          ARM::MVE_VMLALDAVs16,   ARM::MVE_VMLALDAVs32,
+          ARM::MVE_VMLALDAVas16,  ARM::MVE_VMLALDAVas32,
+          ARM::MVE_VMLALDAVxs16,  ARM::MVE_VMLALDAVxs32,
+          ARM::MVE_VMLALDAVaxs16, ARM::MVE_VMLALDAVaxs32,
+          ARM::MVE_VMLSLDAVs16,   ARM::MVE_VMLSLDAVs32,
+          ARM::MVE_VMLSLDAVas16,  ARM::MVE_VMLSLDAVas32,
+          ARM::MVE_VMLSLDAVxs16,  ARM::MVE_VMLSLDAVxs32,
+          ARM::MVE_VMLSLDAVaxs16, ARM::MVE_VMLSLDAVaxs32,
+      };
+      SelectMVE_VMLLDAV(N, IntNo == Intrinsic::arm_mve_vmlldava_predicated,
+                        OpcodesS, OpcodesU);
+      return;
+    }
+
+    case Intrinsic::arm_mve_vrmlldavha:
+    case Intrinsic::arm_mve_vrmlldavha_predicated: {
+      static const uint16_t OpcodesU[] = {
+          ARM::MVE_VRMLALDAVHu32,  ARM::MVE_VRMLALDAVHau32,
+      };
+      static const uint16_t OpcodesS[] = {
+          ARM::MVE_VRMLALDAVHs32,  ARM::MVE_VRMLALDAVHas32,
+          ARM::MVE_VRMLALDAVHxs32, ARM::MVE_VRMLALDAVHaxs32,
+          ARM::MVE_VRMLSLDAVHs32,  ARM::MVE_VRMLSLDAVHas32,
+          ARM::MVE_VRMLSLDAVHxs32, ARM::MVE_VRMLSLDAVHaxs32,
+      };
+      SelectMVE_VRMLLDAVH(N, IntNo == Intrinsic::arm_mve_vrmlldavha_predicated,
+                          OpcodesS, OpcodesU);
+      return;
+    }
     }
     break;
   }
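
The opcode tables passed to these selection helpers are laid out so that a
single flat index can be computed from the subtract, exchange and accumulate
flags plus the element-width index: the signed tables cover all eight flag
combinations per width, while the unsigned tables only need the two accumulate
rows, because unsigned subtract and exchange forms do not exist (the asserts
in SelectBaseMVE_VMLLDAV enforce this). A small C++ sketch of the arithmetic,
with illustrative names only:

  #include <cstddef>

  // Sketch only: the flat index that SelectBaseMVE_VMLLDAV's pointer
  // arithmetic computes into a table laid out as
  // [subtract][exchange][accumulate][element width].
  static size_t opcodeIndex(bool IsSub, bool IsExchange, bool IsAccum,
                            size_t Stride, size_t TySize) {
    return Stride * (4 * IsSub + 2 * IsExchange + IsAccum) + TySize;
  }

  // Table sizes implied by that layout.
  constexpr size_t signedTableSize(size_t Stride)   { return Stride * 8; }
  constexpr size_t unsignedTableSize(size_t Stride) { return Stride * 2; }
  static_assert(signedTableSize(2) == 16 && unsignedTableSize(2) == 4,
                "vmlldava tables (OpcodesS/OpcodesU above)");
  static_assert(signedTableSize(1) == 8 && unsignedTableSize(1) == 2,
                "vrmlldavha tables");

For example, vmlsldavax.s32 has Stride 2, all three flags set and TySize 1,
which gives index 15, the last entry of the 16-entry signed table.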

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 21f0d5e86790..a40231c4aa13 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -567,10 +567,10 @@ class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
   let Inst{4} = 0b0;
 }
 
-class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+class MVE_VABAV<string suffix, bit U, bits<2> size>
   : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
               NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
-              pattern> {
+              []> {
   bits<4> Qm;
   bits<4> Qn;
   bits<4> Rda;
@@ -589,12 +589,36 @@ class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
   let Inst{0} = 0b1;
 }
 
-def MVE_VABAVs8  : MVE_VABAV<"s8", 0b0, 0b00>;
-def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>;
-def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>;
-def MVE_VABAVu8  : MVE_VABAV<"u8", 0b1, 0b00>;
-def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>;
-def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>;
+multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
+  def "" : MVE_VABAV<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(i32 (int_arm_mve_vabav
+                            (i32 VTI.Unsigned),
+                            (i32 rGPR:$Rda_src),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+              (i32 (!cast<Instruction>(NAME)
+                            (i32 rGPR:$Rda_src),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+    def : Pat<(i32 (int_arm_mve_vabav_predicated
+                            (i32 VTI.Unsigned),
+                            (i32 rGPR:$Rda_src),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                            (VTI.Pred VCCR:$mask))),
+              (i32 (!cast<Instruction>(NAME)
+                            (i32 rGPR:$Rda_src),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                            ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+  }
+}
+
+defm MVE_VABAVs8  : MVE_VABAV_m<MVE_v16s8>;
+defm MVE_VABAVs16 : MVE_VABAV_m<MVE_v8s16>;
+defm MVE_VABAVs32 : MVE_VABAV_m<MVE_v4s32>;
+defm MVE_VABAVu8  : MVE_VABAV_m<MVE_v16u8>;
+defm MVE_VABAVu16 : MVE_VABAV_m<MVE_v8u16>;
+defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>;
 
 class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
               bit A, bit U, bits<2> size, list<dag> pattern=[]>
@@ -803,10 +827,9 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
 defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
 
 class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
-                   bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
-                   list<dag> pattern=[]>
+                   bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
   : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
-              "$RdaDest, $Qn, $Qm", cstr, pattern> {
+              "$RdaDest, $Qn, $Qm", cstr, []> {
   bits<4> RdaDest;
   bits<3> Qm;
   bits<3> Qn;
@@ -824,47 +847,88 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
   let Inst{0} = bit_0;
 }
 
-multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix,
-                            bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
-                            list<dag> pattern=[]> {
-  def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix,
+multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
+                            bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> {
+  def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix,
                                    (ins MQPR:$Qn, MQPR:$Qm), "",
-                                   sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
-  def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix,
+                                   sz, bit_28, 0b0, X, bit_8, bit_0>;
+  def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix,
                                     (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
                                     "$RdaDest = $RdaSrc",
-                                    sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+                                    sz, bit_28, 0b1, X, bit_8, bit_0>;
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(i32 (int_arm_mve_vmldava
+                            (i32 VTI.Unsigned),
+                            (i32 bit_0) /* subtract */,
+                            (i32 X) /* exchange */,
+                            (i32 0) /* accumulator */,
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+              (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+    def : Pat<(i32 (int_arm_mve_vmldava_predicated
+                            (i32 VTI.Unsigned),
+                            (i32 bit_0) /* subtract */,
+                            (i32 X) /* exchange */,
+                            (i32 0) /* accumulator */,
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                            (VTI.Pred VCCR:$mask))),
+              (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                             ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+
+    def : Pat<(i32 (int_arm_mve_vmldava
+                            (i32 VTI.Unsigned),
+                            (i32 bit_0) /* subtract */,
+                            (i32 X) /* exchange */,
+                            (i32 tGPREven:$RdaSrc),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+              (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+                            (i32 tGPREven:$RdaSrc),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+    def : Pat<(i32 (int_arm_mve_vmldava_predicated
+                            (i32 VTI.Unsigned),
+                            (i32 bit_0) /* subtract */,
+                            (i32 X) /* exchange */,
+                            (i32 tGPREven:$RdaSrc),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                            (VTI.Pred VCCR:$mask))),
+              (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+                            (i32 tGPREven:$RdaSrc),
+                            (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+                             ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+  }
 }
 
-multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28,
-                             bit bit_8, bit bit_0, list<dag> pattern=[]> {
-  defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28,
-                             0b0, bit_8, bit_0, pattern>;
-  defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28,
-                             0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLAMLSDAV_AX<string iname, MVEVectorVTInfo VTI, bit sz,
+                             bit bit_28, bit bit_8, bit bit_0> {
+  defm "" : MVE_VMLAMLSDAV_A<iname, "", VTI, sz, bit_28,
+                             0b0, bit_8, bit_0>;
+  defm "" : MVE_VMLAMLSDAV_A<iname, "x", VTI, sz, bit_28,
+                             0b1, bit_8, bit_0>;
 }
 
-multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8,
-                             list<dag> pattern=[]> {
-  defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix,
-                              sz, 0b0, bit_8, 0b0, pattern>;
-  defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix,
-                             sz, 0b1, 0b0, bit_8, 0b0, pattern>;
+multiclass MVE_VMLADAV_multi<MVEVectorVTInfo SVTI, MVEVectorVTInfo UVTI,
+                             bit sz, bit bit_8> {
+  defm "" : MVE_VMLAMLSDAV_AX<"vmladav", SVTI,
+                              sz, 0b0, bit_8, 0b0>;
+  defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", UVTI,
+                             sz, 0b1, 0b0, bit_8, 0b0>;
 }
 
-multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
-                             list<dag> pattern=[]> {
-  defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix,
-                              sz, bit_28, 0b0, 0b1, pattern>;
+multiclass MVE_VMLSDAV_multi<MVEVectorVTInfo VTI, bit sz, bit bit_28> {
+  defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", VTI,
+                              sz, bit_28, 0b0, 0b1>;
 }
 
-defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v16s8, MVE_v16u8, 0b0, 0b1>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v8s16, MVE_v8u16, 0b0, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v4s32, MVE_v4u32, 0b1, 0b0>;
 
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
 
 // vmlav aliases vmladav
 foreach acc = ["", "a"] in {

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
new file mode 100644
index 000000000000..f7fb69e8c311
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabavq.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.vabav.v16i8(i32, i32, <16 x i8>, <16 x i8>)
+declare i32 @llvm.arm.mve.vabav.v8i16(i32, i32, <8 x i16>, <8 x i16>)
+declare i32 @llvm.arm.mve.vabav.v4i32(i32, i32, <4 x i32>, <4 x i32>)
+
+declare i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32, i32, <16 x i8>, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vabavq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vabavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vabavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v4i32(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vabavq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v16i8(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vabavq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v8i16(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vabavq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabav.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vabav.v4i32(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v16i8.v16i1(i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v8i16.v8i1(i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vabavq_p_u32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vabavq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabavt.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vabav.predicated.v4i32.v4i1(i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
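
These tests exercise absolute-difference-and-accumulate across the whole
vector: each pair of corresponding lanes contributes |Qn[i] - Qm[i]| to the
scalar accumulator in Rda. A scalar C++ model of the s8 form, as a sketch of
the expected semantics rather than the compiler's code:

  #include <cstdint>
  #include <cstdlib>

  // Sketch only: scalar model of vabav.s8 over sixteen signed 8-bit lanes.
  static uint32_t vabavModelS8(uint32_t Rda, const int8_t Qn[16],
                               const int8_t Qm[16]) {
    for (int I = 0; I < 16; ++I)
      Rda += static_cast<uint32_t>(std::abs(Qn[I] - Qm[I]));
    return Rda;
  }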

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
new file mode 100644
index 000000000000..ae8f24386b9c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmldav.ll
@@ -0,0 +1,734 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.vmldava.v16i8(i32, i32, i32, i32, <16 x i8>, <16 x i8>)
+declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>)
+declare i32 @llvm.arm.mve.vmldava.v4i32(i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+
+declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_u32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlava.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmladavaxq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavax.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmladavaxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavax.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmladavaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavax.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdava.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdava.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsdavaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdava.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s8(i32 %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavax.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s16(i32 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavax.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_s32(i32 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsdavaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavax.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaq_p_u32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavaxt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavaxt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavaxt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavat.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavat.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavat.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s8(i32 %a, <16 x i8> %b, <16 x i8> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavaxt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s16(i32 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavaxt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavaxq_p_s32(i32 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavaxt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmladavxq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavx.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmladavxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavx.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmladavxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmladavx.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmlsdavq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdav.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsdavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdav.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsdavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdav.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavx.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v16i8(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavx.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsdavxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsdavx.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 1, i32 0, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavxt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavxt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmladavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmladavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmladavxt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 0, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavxt.s8 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 1, i32 1, i32 0, <16 x i8> %a, <16 x i8> %b, <16 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavxt.s16 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmlsdavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsdavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsdavxt.s32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
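
These tests cover the 32-bit dual-accumulate family: in scalar terms the
accumulating form is a lane-wise dot product added to the incoming
accumulator, with the vmlsdav and exchange (x) variants changing how adjacent
lane pairs are combined. A scalar C++ model of vmladavaq_s16 only, as a sketch
of the expected semantics rather than the compiler's code:

  #include <cstdint>

  // Sketch only: scalar model of vmladavaq_s16.  The unsigned running sum
  // models the accumulator's 32-bit wrap-around without signed-overflow UB;
  // the subtract and exchange variants are not modelled here.
  static int32_t vmladavaModelS16(int32_t Acc, const int16_t Qn[8],
                                  const int16_t Qm[8]) {
    uint32_t Sum = static_cast<uint32_t>(Acc);
    for (int I = 0; I < 8; ++I)
      Sum += static_cast<uint32_t>(static_cast<int32_t>(Qn[I]) * Qm[I]);
    return static_cast<int32_t>(Sum);
  }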

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll
new file mode 100644
index 000000000000..9a215b369335
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmlldav.ll
@@ -0,0 +1,1183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+declare { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>)
+
+declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+declare { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32, i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalva.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalva.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalva.u16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalva.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlaldavaxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlaldavax.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlaldavaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlaldavax.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsldavaq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldava.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsldavaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldava.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_s16(i64 %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vmlsldavaxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavax.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vmlsldavaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavax.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvha.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_u32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvha.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlaldavhaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlaldavhax.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlsldavhaq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavha.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_s32(i64 %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vrmlsldavhaxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavhax.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvat.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvat.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavaxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavat.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavat.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldaxvaq_p_s16(i64 %a, <8 x i16> %b, <8 x i16> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldaxvaq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavaxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <8 x i16> %b, <8 x i16> %c, <8 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvhat.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaq_p_u32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvhat.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlaldavhaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhaq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavhat.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhaxq_p_s32(i64 %a, <4 x i32> %b, <4 x i32> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhaxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavhaxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %a, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %a to i32
+  %3 = zext i16 %p to i32
+  %4 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %3)
+  %5 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 %2, i32 %1, <4 x i32> %b, <4 x i32> %c, <4 x i1> %4)
+  %6 = extractvalue { i32, i32 } %5, 1
+  %7 = zext i32 %6 to i64
+  %8 = shl i64 %7, 32
+  %9 = extractvalue { i32, i32 } %5, 0
+  %10 = zext i32 %9 to i64
+  %11 = or i64 %8, %10
+  ret i64 %11
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlaldavxq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlaldavx.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlaldavxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlaldavx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsldavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldav.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsldavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldav.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxvq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmlsldavxvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavx.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmlsldavxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlsldavx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vmlldava.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvh.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlalvh.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlaldavhxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlaldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavh.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrmlsldavhxq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrmlsldavhx.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.v4i32(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 0, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlaldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlaldavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlaldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldaxvq_p_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldaxvq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavxt.s16 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v8i16.v8i1(i32 0, i32 1, i32 1, i32 0, i32 0, <8 x i16> %a, <8 x i16> %b, <8 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vmlsldavxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmlsldavxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmlsldavxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vmlldava.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvht.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhq_p_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlalvht.u32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlaldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlaldavhxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlaldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 0, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavht.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define arm_aapcs_vfpcc i64 @test_vrmlsldavhxq_p_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrmlsldavhxq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmlsldavhxt.s32 r0, r1, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call { i32, i32 } @llvm.arm.mve.vrmlldavha.predicated.v4i32.v4i1(i32 0, i32 1, i32 1, i32 0, i32 0, <4 x i32> %a, <4 x i32> %b, <4 x i1> %1)
+  %3 = extractvalue { i32, i32 } %2, 1
+  %4 = zext i32 %3 to i64
+  %5 = shl i64 %4, 32
+  %6 = extractvalue { i32, i32 } %2, 0
+  %7 = zext i32 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
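
Note on how these IR patterns arise: each test above mirrors a user-level ACLE intrinsic from <arm_mve.h>; the 64-bit accumulator is split into two 32-bit halves before the call (lshr/trunc) and reassembled afterwards (zext/shl/or), because the underlying IR intrinsic returns the result as an { i32, i32 } pair. A minimal C usage sketch follows, illustrative only; it assumes an MVE-enabled clang target (e.g. -march=armv8.1-m.main+mve) and is not part of the commit itself.

#include <arm_mve.h>

/* Accumulating long dot product: acc += sum over lanes of b[i] * c[i].
   Lowers to the llvm.arm.mve.vmlldava IR intrinsic exercised above. */
int64_t dot_accumulate(int64_t acc, int16x8_t b, int16x8_t c) {
    return vmlaldavaq_s16(acc, b, c);
}

/* Predicated variant: only lanes enabled by the predicate p contribute,
   matching the llvm.arm.mve.vmlldava.predicated tests. */
int64_t dot_accumulate_pred(int64_t acc, int16x8_t b, int16x8_t c,
                            mve_pred16_t p) {
    return vmlaldavaq_p_s16(acc, b, c, p);
}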



