[llvm] 228c740 - [ARM][MVE][Intrinsics] Add _x() variants of my _m() intrinsics.

Fri Dec 13 03:51:38 PST 2019

Author: Mark Murray
Date: 2019-12-13T11:51:23Z
New Revision: 228c74076d5168a35274b04505f99e373f74f65f

URL: https://github.com/llvm/llvm-project/commit/228c74076d5168a35274b04505f99e373f74f65f
DIFF: https://github.com/llvm/llvm-project/commit/228c74076d5168a35274b04505f99e373f74f65f.diff

LOG: [ARM][MVE][Intrinsics] Add *_x() variants of my *_m() intrinsics.

Summary:
Better use of multiclass is used, and this helped find some existing
bugs in the predicated VMULL* intrinsics, which are now fixed.

The refactored VMULL[TB]Q_(INT|POLY)_M() intrinsics were discovered
to have an argument ("inactive") with incorrect type, and this required
a fix that is included in this whole patch. The argument "inactive"
should have been the same width (per vector element) as the return
type of the intrinsic, but was not in the case where the return type
was double the element width of the input types.

To assist in testing the multiclassing , and to thwart further gremlins,
the unit tests are improved in scope.

The *.ll tests are all generated by a small bit of throw-away scripting
from the corresponding *.c tests, and as such the diffs are large and
nasty. Look at the file rather than the diff.

Reviewers: dmgreen, miyuki, ostannard, simon_tatham

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71421

Added: 
    clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
    clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
    clang/test/CodeGen/arm-mve-intrinsics/vandq.c
    clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
    clang/test/CodeGen/arm-mve-intrinsics/veorq.c
    clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
    clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
    clang/test/CodeGen/arm-mve-intrinsics/vminq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
    clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
    clang/test/CodeGen/arm-mve-intrinsics/vornq.c
    clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
    clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
    clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
    llvm/include/llvm/IR/IntrinsicsARM.td
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 7ed3c04c58db..8bb567c573a3 100644

--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -135,91 +135,70 @@ def vabdq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
                              (IRInt<"vabd", [Vector]> $a, $b)>;
 }
 
+multiclass VectorVectorArithmetic<string operation> {
+  defm "" : IntrinsicMX<Vector, (args Vector:$a, Vector:$b,
+                                 Predicate:$pred),
+                                (IRInt<operation, [Vector, Predicate]> $a, $b,
+                                 $pred, $inactive)>;
+}
+
+multiclass VectorVectorArithmeticBitcast<string operation> {
+  defm "" : IntrinsicMX<Vector, (args Vector:$a, Vector:$b,
+                                 Predicate:$pred),
+                                (bitcast (IRInt<operation, [UVector, Predicate]>
+                                          (bitcast $a, UVector),
+                                          (bitcast $b, UVector),
+                                          $pred,
+                                          (bitcast $inactive, UVector)), Vector)>;
+}
+
 // Predicated intrinsics
 let params = T.Usual in {
-def vabdq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"abd_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vaddq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"add_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vsubq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"sub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vmulq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mul_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-defm vandq_m: predicated_bit_op_fp<"and_predicated">;
-defm vbicq_m: predicated_bit_op_fp<"bic_predicated">;
-defm veorq_m: predicated_bit_op_fp<"eor_predicated">;
-defm vornq_m: predicated_bit_op_fp<"orn_predicated">;
-defm vorrq_m: predicated_bit_op_fp<"orr_predicated">;
+  defm vabdq : VectorVectorArithmetic<"abd_predicated">;
+  defm vaddq : VectorVectorArithmetic<"add_predicated">;
+  defm vsubq : VectorVectorArithmetic<"sub_predicated">;
+  defm vmulq : VectorVectorArithmetic<"mul_predicated">;
+  defm vandq : VectorVectorArithmeticBitcast<"and_predicated">;
+  defm vbicq : VectorVectorArithmeticBitcast<"bic_predicated">;
+  defm veorq : VectorVectorArithmeticBitcast<"eor_predicated">;
+  defm vornq : VectorVectorArithmeticBitcast<"orn_predicated">;
+  defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">;
+}
+
+multiclass DblVectorVectorArithmetic<string operation, dag top> {
+  defm "" : IntrinsicMX<DblVector, (args Vector:$a, Vector:$b,
+                                    Predicate:$pred),
+                                   (IRInt<operation,
+                                          [DblVector, Vector, Predicate]>
+                                    $a, $b, top, $pred, $inactive)>;
 }
 
 // Predicated intrinsics - Int types only
 let params = T.Int in {
-def vminq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"min_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vmaxq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"max_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vmulhq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vrmulhq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"rmulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vqdmulhq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"qdmulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vqrdmulhq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"qrdmulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vmullbq_int_m: Intrinsic<
-    DblVector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mull_int_predicated", [DblVector, Vector, Predicate]> $a, $b, 0,
-     $pred, $inactive)>;
-def vmulltq_int_m: Intrinsic<
-    DblVector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mull_int_predicated", [DblVector, Vector, Predicate]> $a, $b, 1,
-     $pred, $inactive)>;
-def vqaddq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"qadd_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vhaddq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"hadd_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vrhaddq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"rhadd_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vqsubq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"qsub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vhsubq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"hsub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
+  defm vminq : VectorVectorArithmetic<"min_predicated">;
+  defm vmaxq : VectorVectorArithmetic<"max_predicated">;
+  defm vmulhq : VectorVectorArithmetic<"mulh_predicated">;
+  defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated">;
+  defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated">;
+  defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated">;
+  defm vqaddq : VectorVectorArithmetic<"qadd_predicated">;
+  defm vhaddq : VectorVectorArithmetic<"hadd_predicated">;
+  defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated">;
+  defm vqsubq : VectorVectorArithmetic<"qsub_predicated">;
+  defm vhsubq : VectorVectorArithmetic<"hsub_predicated">;
+  defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 0)>;
+  defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 1)>;
 }
 
 let params = T.Poly, overrideKindLetter = "p" in {
-def vmullbq_poly_m: Intrinsic<
-    DblVector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mull_poly_predicated", [DblVector, Vector, Predicate]> $a, $b, 0,
-     $pred, $inactive)>;
-def vmulltq_poly_m: Intrinsic<
-    DblVector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"mull_poly_predicated", [DblVector, Vector, Predicate]> $a, $b, 1,
-     $pred, $inactive)>;
+  defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 0)>;
+  defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 1)>;
 }
 
 // Predicated intrinsics - Float types only
 let params = T.Float in {
-def vminnmq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"min_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
-def vmaxnmq_m: Intrinsic<
-    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
-    (IRInt<"max_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
+  defm vminnmq : VectorVectorArithmetic<"min_predicated">;
+  defm vmaxnmq : VectorVectorArithmetic<"max_predicated">;
 }
 
 let params = T.Int in {

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
index a416bfb773e6..fb8a5fb5e562 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
@@ -93,3 +93,51 @@ float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b,
     return vabdq_m_f32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vabdq_x_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vabdq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vabdq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vabdq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vabdq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vabdq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vabdq_x_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vabdq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vabdq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vabdq_x_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
index 0f4402c9ea53..1810c7e89b54 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -18,17 +18,17 @@ uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_f16(
+// CHECK-LABEL: @test_vaddq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
-float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b)
+float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b)
 {
 #ifdef POLYMORPHIC
-    return vsubq(a, b);
+    return vaddq(a, b);
 #else /* POLYMORPHIC */
-    return vsubq_f16(a, b);
+    return vaddq_f16(a, b);
 #endif /* POLYMORPHIC */
 }
 
@@ -48,18 +48,50 @@ int8x16_t test_vaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_m_f32(
+// CHECK-LABEL: @test_vaddq_m_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
-float32x4_t test_vsubq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
+float32x4_t test_vaddq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vaddq_m_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_x_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vaddq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_x_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vaddq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
-    return vsubq_m(inactive, a, b, p);
+    return vaddq_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vsubq_m_f32(inactive, a, b, p);
+    return vaddq_x_f16(a, b, p);
 #endif /* POLYMORPHIC */
 }

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vandq.c b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
index aeab8b7063ec..9c88b67085fe 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
@@ -4,6 +4,34 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_vandq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b)
+{
+#ifdef POLYMORPHIC
+    return vandq(a, b);
+#else /* POLYMORPHIC */
+    return vandq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vandq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vandq(a, b);
+#else /* POLYMORPHIC */
+    return vandq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vandq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
@@ -51,6 +79,38 @@ int8x16_t test_vandq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
+// CHECK-LABEL: @test_vandq_m_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vandq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vandq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vandq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vandq_m_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
@@ -70,3 +130,70 @@ float16x8_t test_vandq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return vandq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vandq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.and.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vandq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vandq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vandq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vandq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vandq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vandq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i1> [[TMP3]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float32x4_t test_vandq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vandq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vandq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
index 3106b40a322d..22ff5d5e739c 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
@@ -4,6 +4,36 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_vbicq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[B:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, b);
+#else /* POLYMORPHIC */
+    return vbicq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i16> [[B:%.*]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, b);
+#else /* POLYMORPHIC */
+    return vbicq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vbicq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -53,6 +83,38 @@ int8x16_t test_vbicq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
+// CHECK-LABEL: @test_vbicq_m_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vbicq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vbicq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vbicq_m_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
@@ -72,3 +134,70 @@ float16x8_t test_vbicq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return vbicq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vbicq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.bic.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vbicq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vbicq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vbicq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i1> [[TMP3]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float32x4_t test_vbicq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vbicq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/veorq.c b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
index c271568f791f..9025da9feefd 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
@@ -4,6 +4,34 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_veorq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b)
+{
+#ifdef POLYMORPHIC
+    return veorq(a, b);
+#else /* POLYMORPHIC */
+    return veorq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_veorq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return veorq(a, b);
+#else /* POLYMORPHIC */
+    return veorq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_veorq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]]
@@ -51,6 +79,38 @@ int8x16_t test_veorq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
+// CHECK-LABEL: @test_veorq_m_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_veorq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_veorq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_veorq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_veorq_m_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
@@ -70,3 +130,70 @@ float16x8_t test_veorq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return veorq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_veorq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.eor.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_veorq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_veorq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_veorq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_veorq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_veorq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_veorq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i1> [[TMP3]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float32x4_t test_veorq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return veorq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return veorq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
index 5eb2cc8e1844..a31e6ae4ffa1 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
@@ -93,3 +93,51 @@ int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
     return vhaddq_m_s32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vhaddq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vhaddq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vhaddq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vhaddq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
index 63300466c819..3f44b654c488 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
@@ -63,3 +63,35 @@ float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
     return vmaxnmq_m_f32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmaxnmq_x_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmaxnmq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmaxnmq_x_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmq_x_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vmaxnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmaxnmq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmaxnmq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
index 133e28d6cf04..7b766fb0d257 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
@@ -96,3 +96,51 @@ uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve
     return vmaxq_m_u32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmaxq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmaxq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmaxq_x_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxq_x_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmaxq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmaxq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxq_x_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmaxq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmaxq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
index 9ed5bf0c859b..ed6b31dfb0f3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
@@ -63,3 +63,35 @@ float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
     return vminnmq_m_f32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vminnmq_x_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vminnmq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vminnmq_x_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmq_x_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vminnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vminnmq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vminnmq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
index 9e54eaeb5d83..b6981104d8b4 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
@@ -96,3 +96,51 @@ int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pre
     return vminq_m_s32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vminq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vminq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vminq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vminq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vminq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminq_x_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vminq_x_s32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vminq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vminq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
index 63696d698c50..3a6b89504dd4 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
@@ -93,3 +93,51 @@ int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
     return vmulhq_m_s32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmulhq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulhq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulhq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulhq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulhq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmulhq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulhq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
index 124e20507c8f..008a368912c2 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
@@ -64,10 +64,10 @@ uint32x4_t test_vmullbq_poly_p16(uint16x8_t a, uint16x8_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
-int16x8_t test_vmullbq_int_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmullbq_int_m(inactive, a, b, p);
@@ -80,10 +80,10 @@ int16x8_t test_vmullbq_int_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
-uint32x4_t test_vmullbq_int_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmullbq_int_m(inactive, a, b, p);
@@ -96,10 +96,10 @@ uint32x4_t test_vmullbq_int_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
-int64x2_t test_vmullbq_int_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+int64x2_t test_vmullbq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmullbq_int_m(inactive, a, b, p);
@@ -112,10 +112,10 @@ int64x2_t test_vmullbq_int_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, m
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
-uint16x8_t test_vmullbq_poly_m_p8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+uint16x8_t test_vmullbq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmullbq_poly_m(inactive, a, b, p);
@@ -123,3 +123,67 @@ uint16x8_t test_vmullbq_poly_m_p8(uint8x16_t inactive, uint8x16_t a, uint8x16_t
     return vmullbq_poly_m_p8(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmullbq_int_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmullbq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmullbq_int_x_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmullbq_int_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmullbq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmullbq_int_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmullbq_int_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmullbq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmullbq_int_x_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmullbq_poly_x_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.poly.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmullbq_poly_x_p16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmullbq_poly_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmullbq_poly_x_p16(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
index 865970305915..7baa9944ba1c 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
@@ -64,10 +64,10 @@ uint32x4_t test_vmulltq_poly_p16(uint16x8_t a, uint16x8_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
-int16x8_t test_vmulltq_int_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmulltq_int_m(inactive, a, b, p);
@@ -80,10 +80,10 @@ int16x8_t test_vmulltq_int_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
-uint32x4_t test_vmulltq_int_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmulltq_int_m(inactive, a, b, p);
@@ -96,10 +96,10 @@ uint32x4_t test_vmulltq_int_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
-int64x2_t test_vmulltq_int_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+int64x2_t test_vmulltq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmulltq_int_m(inactive, a, b, p);
@@ -112,10 +112,10 @@ int64x2_t test_vmulltq_int_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, m
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
-uint16x8_t test_vmulltq_poly_m_p8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+uint16x8_t test_vmulltq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmulltq_poly_m(inactive, a, b, p);
@@ -123,3 +123,67 @@ uint16x8_t test_vmulltq_poly_m_p8(uint8x16_t inactive, uint8x16_t a, uint8x16_t
     return vmulltq_poly_m_p8(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmulltq_int_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulltq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulltq_int_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulltq_int_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulltq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulltq_int_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulltq_int_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+uint64x2_t test_vmulltq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulltq_int_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulltq_int_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulltq_poly_x_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmulltq_poly_x_p8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulltq_poly_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulltq_poly_x_p8(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
index ac457cba81eb..536bc7322fc2 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -123,3 +123,67 @@ float16x8_t test_vmulq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return vmulq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vmulq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmulq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmulq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmulq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vmulq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vornq.c b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
index 753a6ddf2ee1..8505bfed6047 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
@@ -4,6 +4,36 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_vornq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[B:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i8> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b)
+{
+#ifdef POLYMORPHIC
+    return vornq(a, b);
+#else /* POLYMORPHIC */
+    return vornq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i16> [[B:%.*]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i16> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vornq(a, b);
+#else /* POLYMORPHIC */
+    return vornq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vornq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -53,6 +83,38 @@ int8x16_t test_vornq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
+// CHECK-LABEL: @test_vornq_m_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vornq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vornq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vornq_m_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
@@ -72,3 +134,70 @@ float16x8_t test_vornq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return vornq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vornq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.orn.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vornq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vornq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vornq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i1> [[TMP3]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float32x4_t test_vornq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vornq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vornq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
index 436f6277e073..3ddc8629dd4e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
@@ -4,6 +4,34 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_vorrq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, b);
+#else /* POLYMORPHIC */
+    return vorrq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, b);
+#else /* POLYMORPHIC */
+    return vorrq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vorrq_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], [[B:%.*]]
@@ -51,6 +79,38 @@ int8x16_t test_vorrq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
+// CHECK-LABEL: @test_vorrq_m_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vorrq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vorrq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
 // CHECK-LABEL: @test_vorrq_m_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
@@ -70,3 +130,70 @@ float16x8_t test_vorrq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
     return vorrq_m_f16(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vorrq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vorrq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vorrq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vorrq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i1> [[TMP3]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float32x4_t test_vorrq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vorrq_x_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
index b5cff396f528..eb34fec9c13a 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
@@ -93,3 +93,51 @@ int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
     return vrhaddq_m_s32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vrhaddq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrhaddq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrhaddq_x_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrhaddq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrhaddq_x_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vrhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrhaddq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
index 2c8148405585..3bf77f0d16c7 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
@@ -93,3 +93,51 @@ int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
     return vrmulhq_m_s32(inactive, a, b, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vrmulhq_x_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrmulhq_x_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrmulhq_x_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrmulhq_x_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrmulhq_m_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vrmulhq_m_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrmulhq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vrmulhq_x_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
new file mode 100644
index 000000000000..dd11aa848bcc
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
@@ -0,0 +1,97 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vsubq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = sub <4 x i32> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vsubq(a, b);
+#else /* POLYMORPHIC */
+    return vsubq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+//
+float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vsubq(a, b);
+#else /* POLYMORPHIC */
+    return vsubq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_m_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vsubq_m_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vsubq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vsubq_m_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_x_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vsubq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vsubq_x_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_x_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vsubq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+    return vsubq_x_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 7861ce464f29..8995f969e2ea 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -848,11 +848,11 @@ def int_arm_mve_qrdmulh_predicated: Intrinsic<[llvm_anyvector_ty],
    [IntrNoMem]>;
 def int_arm_mve_mull_int_predicated: Intrinsic<[llvm_anyvector_ty],
    [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
-    LLVMMatchType<1>],
+    LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_mull_poly_predicated: Intrinsic<[llvm_anyvector_ty],
    [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
-    LLVMMatchType<1>],
+    LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_qadd_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 2a063e4ebdec..a1a8614e4a7b 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3831,11 +3831,11 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
     // Predicated multiply
     def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             (i32 Top),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (VTI.Pred VCCR:$mask), (VTI.DblVec MQPR:$inactive))),
               (VTI.DblVec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
-                            (VTI.Vec MQPR:$inactive)))>;
+                            (VTI.DblVec MQPR:$inactive)))>;
   }
 }
 

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
index bafff00ea1de..fcd57664a16f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
@@ -1,31 +1,61 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) {
+define arm_aapcs_vfpcc <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vabd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %0
+}
+
+declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>) #1
+
+define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vabd.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>%a, <4 x i32>%b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>) #1
 
-define arm_aapcs_vfpcc <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vabd.f32 q0, q0, q1
+; CHECK-NEXT:    vabd.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x float> @llvm.arm.mve.vabd.v4f32(<4 x float>%a, <4 x float>%b)
-  ret <4 x float> %0
+  %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %0
 }
 
-declare <4 x float> @llvm.arm.mve.vabd.v4f32(<4 x float>, <4 x float>)
+declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>) #1
 
-define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+define arm_aapcs_vfpcc <8 x i16> @test_vabdq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_m_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabdt.s16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
+
+declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -39,24 +69,71 @@ entry:
   ret <16 x i8> %2
 }
 
-declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
+
+declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+
+define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabdt.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  ret <4 x float> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
+
+declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+define arm_aapcs_vfpcc <8 x i16> @test_vabdq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_x_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabdt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vabdq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vabdt.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
 
-declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
 
-define arm_aapcs_vfpcc <8 x half> @test_vabdq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
-; CHECK-LABEL: test_vabdq_m_f16:
+define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_x_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vabdt.f16 q0, q1, q2
+; CHECK-NEXT:    vabdt.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
   ret <8 x half> %2
 }
 
-declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #1
 
-declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
index a3cb91c21bc8..b7becf6dd2c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define arm_aapcs_vfpcc <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) {
+define arm_aapcs_vfpcc <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vaddq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vadd.i32 q0, q1, q0
@@ -11,37 +11,17 @@ entry:
   ret <4 x i32> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: test_vaddq_f32:
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vaddq_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = fadd <4 x float> %b, %a
-  ret <4 x float> %0
-}
-
-define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
-; CHECK-LABEL: test_vsubq_f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vsub.f16 q0, q0, q1
-; CHECK-NEXT:    bx lr
-entry:
-  %0 = fsub <8 x half> %a, %b
+  %0 = fadd <8 x half> %a, %b
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vsubq_s16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vsub.i16 q0, q0, q1
-; CHECK-NEXT:    bx lr
-entry:
-  %0 = sub <8 x i16> %a, %b
-  ret <8 x i16> %0
-}
-
-define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vaddq_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -55,58 +35,59 @@ entry:
   ret <16 x i8> %2
 }
 
-declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+declare <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
 
-define arm_aapcs_vfpcc <8 x half> @test_vaddq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
-; CHECK-LABEL: test_vaddq_m_f16:
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vaddq_m_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddt.f16 q0, q1, q2
+; CHECK-NEXT:    vaddt.f32 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
-  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
-  ret <8 x half> %2
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  ret <4 x float> %2
 }
 
-declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
 
-define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) {
-; CHECK-LABEL: test_vsubq_m_f32:
+define arm_aapcs_vfpcc <8 x i16> @test_vaddq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vaddq_x_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vsubt.f32 q0, q1, q2
+; CHECK-NEXT:    vaddt.i16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
-  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
-  ret <4 x float> %2
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-define arm_aapcs_vfpcc <4 x i32> @test_vsubq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
-; CHECK-LABEL: test_vsubq_m_u32:
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vaddq_x_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vsubt.i32 q0, q1, q2
+; CHECK-NEXT:    vaddt.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
-  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
-  ret <4 x i32> %2
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  ret <8 x half> %2
 }
 
-declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll
index 1b1d498bc378..651ac17234b8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll
@@ -11,24 +11,24 @@ entry:
   ret <16 x i8> %0
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vandq_u32:
+define arm_aapcs_vfpcc <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vandq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = and <4 x i32> %b, %a
-  ret <4 x i32> %0
+  %0 = and <8 x i16> %b, %a
+  ret <8 x i16> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vandq_s16:
+define arm_aapcs_vfpcc <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vandq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = and <8 x i16> %b, %a
-  ret <8 x i16> %0
+  %0 = and <4 x i32> %b, %a
+  ret <4 x i32> %0
 }
 
 define arm_aapcs_vfpcc <4 x float> @test_vandq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
@@ -80,25 +80,98 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-; Function Attrs: nounwind readnone
-define arm_aapcs_vfpcc <8 x half> @test_vandq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vandq_m_f32:
+define arm_aapcs_vfpcc <4 x i32> @test_vandq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vandq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vandt q0, q1, q2
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = bitcast <8 x half> %inactive to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> %0, <8 x i16> %1, <8 x i1> %3, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <8 x half>
+  ret <8 x half> %6
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vandq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.and.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vandq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vandq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vandq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vandq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
+; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast <4 x float> %a to <4 x i32>
   %1 = bitcast <4 x float> %b to <4 x i32>
   %2 = zext i16 %p to i32
   %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
-  %4 = bitcast <4 x float> %inactive to <4 x i32>
-  %5 = tail call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4)
-  %6 = bitcast <4 x i32> %5 to <8 x half>
-  ret <8 x half> %6
+  %4 = tail call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> undef)
+  %5 = bitcast <4 x i32> %4 to <4 x float>
+  ret <4 x float> %5
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-
-declare <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll
index 47877a13cb96..97b7ac9fc34e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll
@@ -12,26 +12,26 @@ entry:
   ret <16 x i8> %1
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vbicq_u32:
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vbicq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
-  %1 = and <4 x i32> %0, %a
-  ret <4 x i32> %1
+  %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %1 = and <8 x i16> %0, %a
+  ret <8 x i16> %1
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vbicq_s16:
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vbicq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-  %1 = and <8 x i16> %0, %a
-  ret <8 x i16> %1
+  %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = and <4 x i32> %0, %a
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x float> @test_vbicq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
@@ -84,25 +84,98 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-; Function Attrs: nounwind readnone
-define arm_aapcs_vfpcc <8 x half> @test_vbicq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vbicq_m_f32:
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_m_s32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vbict q0, q1, q2
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vbicq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_m_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = bitcast <8 x half> %inactive to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> %0, <8 x i16> %1, <8 x i1> %3, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <8 x half>
+  ret <8 x half> %6
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vbicq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.bic.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vbicq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vbicq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict q0, q0, q1
+; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast <4 x float> %a to <4 x i32>
   %1 = bitcast <4 x float> %b to <4 x i32>
   %2 = zext i16 %p to i32
   %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
-  %4 = bitcast <4 x float> %inactive to <4 x i32>
-  %5 = tail call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4)
-  %6 = bitcast <4 x i32> %5 to <8 x half>
-  ret <8 x half> %6
+  %4 = tail call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> undef)
+  %5 = bitcast <4 x i32> %4 to <4 x float>
+  ret <4 x float> %5
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-
-declare <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll
index 9b66f3656eb2..6f70cafb013d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll
@@ -11,24 +11,24 @@ entry:
   ret <16 x i8> %0
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_veorq_u32:
+define arm_aapcs_vfpcc <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_veorq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <4 x i32> %b, %a
-  ret <4 x i32> %0
+  %0 = xor <8 x i16> %b, %a
+  ret <8 x i16> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_veorq_s16:
+define arm_aapcs_vfpcc <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_veorq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <8 x i16> %b, %a
-  ret <8 x i16> %0
+  %0 = xor <4 x i32> %b, %a
+  ret <4 x i32> %0
 }
 
 define arm_aapcs_vfpcc <4 x float> @test_veorq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
@@ -80,25 +80,98 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-; Function Attrs: nounwind readnone
-define arm_aapcs_vfpcc <8 x half> @test_veorq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_veorq_m_f32:
+define arm_aapcs_vfpcc <4 x i32> @test_veorq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_veorq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    veort q0, q1, q2
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = bitcast <8 x half> %inactive to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> %0, <8 x i16> %1, <8 x i1> %3, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <8 x half>
+  ret <8 x half> %6
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_veorq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.eor.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_veorq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_veorq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_veorq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_veorq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
+; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast <4 x float> %a to <4 x i32>
   %1 = bitcast <4 x float> %b to <4 x i32>
   %2 = zext i16 %p to i32
   %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
-  %4 = bitcast <4 x float> %inactive to <4 x i32>
-  %5 = tail call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4)
-  %6 = bitcast <4 x i32> %5 to <8 x half>
-  ret <8 x half> %6
+  %4 = tail call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> undef)
+  %5 = bitcast <4 x i32> %4 to <4 x float>
+  ret <4 x float> %5
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-
-declare <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
index 11b211424272..82ecc062c086 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
@@ -90,3 +90,46 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
 declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vhaddq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vhaddt.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vhaddq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vhaddt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vhaddq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vhaddt.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
index d89308bb5941..54a140042bfc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define dso_local arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmaxnm.f16 q0, q0, q1
@@ -13,7 +13,7 @@ entry:
 
 declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>) #1
 
-define dso_local arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
@@ -25,7 +25,7 @@ entry:
 
 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
 
-define dso_local arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -38,11 +38,12 @@ entry:
   %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
   ret <8 x half> %2
 }
+
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
 
-define dso_local arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_m_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -59,3 +60,32 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
 declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmaxnmq_x_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmt.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_x_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmaxnmq_x_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmt.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+  ret <4 x float> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
index 09a7d60cd165..622aa1ccd33a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
@@ -1,89 +1,132 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define dso_local arm_aapcs_vfpcc <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vmaxq_u8:
+define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmaxq_s8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmin.u8 q0, q0, q1
+; CHECK-NEXT:    vmax.s8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = icmp ugt <16 x i8> %a, %b
+  %0 = icmp slt <16 x i8> %a, %b
   %1 = select <16 x i1> %0, <16 x i8> %b, <16 x i8> %a
   ret <16 x i8> %1
 }
 
-define dso_local arm_aapcs_vfpcc <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vmaxq_s16:
+define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmaxq_u16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmin.s16 q0, q0, q1
+; CHECK-NEXT:    vmax.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = icmp sgt <8 x i16> %a, %b
+  %0 = icmp ult <8 x i16> %a, %b
   %1 = select <8 x i1> %0, <8 x i16> %b, <8 x i16> %a
   ret <8 x i16> %1
 }
 
-define dso_local arm_aapcs_vfpcc <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vmaxq_u32:
+define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmaxq_s32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmin.u32 q0, q0, q1
+; CHECK-NEXT:    vmax.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = icmp ugt <4 x i32> %a, %b
+  %0 = icmp slt <4 x i32> %a, %b
   %1 = select <4 x i1> %0, <4 x i32> %b, <4 x i32> %a
   ret <4 x i32> %1
 }
 
-define dso_local arm_aapcs_vfpcc <16 x i8> @test_vmaxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vmaxq_m_s8:
+define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_m_u8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_m_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmint.s8 q0, q1, q2
+; CHECK-NEXT:    vmaxt.s8 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
 
-define dso_local arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vmaxq_m_u16:
+define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_m_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmint.s16 q0, q1, q2
+; CHECK-NEXT:    vmaxt.s16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-define dso_local arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vmaxq_m_s32:
+define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_m_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmint.s32 q0, q1, q2
+; CHECK-NEXT:    vmaxt.s32 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxt.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_x_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_x_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmaxq_x_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxt.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
index 10cd674d39a8..ae4490995238 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define dso_local arm_aapcs_vfpcc <8 x half> @test_vminnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x half> @test_vminnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vminnm.f16 q0, q0, q1
@@ -13,7 +13,7 @@ entry:
 
 declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>) #1
 
-define dso_local arm_aapcs_vfpcc <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vminnm.f32 q0, q0, q1
@@ -25,7 +25,7 @@ entry:
 
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
 
-define dso_local arm_aapcs_vfpcc <8 x half> @test_vminnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x half> @test_vminnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -43,7 +43,7 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
 
-define dso_local arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_m_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -60,3 +60,32 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
 declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vminnmq_x_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmt.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vminnmq_x_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vminnmq_x_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmt.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+  ret <4 x float> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
index 0cbef86c928f..96bc2233f226 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define dso_local arm_aapcs_vfpcc <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminq_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmin.u8 q0, q0, q1
@@ -12,7 +12,7 @@ entry:
   ret <16 x i8> %1
 }
 
-define dso_local arm_aapcs_vfpcc <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmin.s16 q0, q0, q1
@@ -23,7 +23,7 @@ entry:
   ret <8 x i16> %1
 }
 
-define dso_local arm_aapcs_vfpcc <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmin.u32 q0, q0, q1
@@ -34,7 +34,7 @@ entry:
   ret <4 x i32> %1
 }
 
-define dso_local arm_aapcs_vfpcc <16 x i8> @test_vminq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+define arm_aapcs_vfpcc <16 x i8> @test_vminq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -52,7 +52,7 @@ declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
 declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
 
-define dso_local arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+define arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -70,7 +70,7 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-define dso_local arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_m_s32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -87,3 +87,46 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
 declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <16 x i8> @test_vminq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vminq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmint.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vminq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vminq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmint.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vminq_x_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vminq_x_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmint.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
index 78ee17b55416..ba15af6c5490 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
@@ -90,3 +90,46 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
 declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulhq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulht.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulhq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulht.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulhq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulht.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
index 78f24fa62c24..e443d54f0207 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
@@ -50,7 +50,7 @@ entry:
 
 declare <4 x i32> @llvm.arm.mve.vmull.poly.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
-define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_m_s8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -60,15 +60,15 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_m_s8(<16 x i8> %inactive, <16
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
 
-define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -78,15 +78,15 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<8 x i16> %inactive, <8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
 
-define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_m_s32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -96,15 +96,15 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<4 x i32> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive)
   ret <2 x i64> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
 
-define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_poly_m_p8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -114,8 +114,68 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<16 x i8> %inactive, <1
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
-declare <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
+declare <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmullbq_int_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmullbt.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmullbq_int_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmullbt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmullbq_int_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmullbt.s32 q2, q0, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> undef)
+  ret <2 x i64> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_poly_x_p16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmullbq_poly_x_p16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmullbt.p16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.poly.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+declare <4 x i32> @llvm.arm.mve.mull.poly.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
index 5dde90e298a7..ca2aae6034a2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
@@ -50,7 +50,7 @@ entry:
 
 declare <4 x i32> @llvm.arm.mve.vmull.poly.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
-define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_m_s8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -60,15 +60,15 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_m_s8(<16 x i8> %inactive, <16
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
 
-define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -78,15 +78,15 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<8 x i16> %inactive, <8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
 
-define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_m_s32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -96,15 +96,15 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<4 x i32> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive)
   ret <2 x i64> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
 
-define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_poly_m_p8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -114,8 +114,66 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<16 x i8> %inactive, <1
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+declare <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulltq_int_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulltt.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulltq_int_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulltt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulltq_int_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulltt.s32 q2, q0, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> undef)
+  ret <2 x i64> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_x_p8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulltq_poly_x_p8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmulltt.p8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
-declare <8 x i16> @llvm.arm.mve.mull.poly.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
index 09d8e11a71ae..19ed08a77a22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
@@ -1,7 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
-define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) {
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.i8 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = mul <16 x i8> %b, %a
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vmulq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = mul <8 x i16> %b, %a
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
@@ -11,17 +31,17 @@ entry:
   ret <4 x i32> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulq_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f32 q0, q1, q0
+; CHECK-NEXT:    vmul.f32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = fmul <4 x float> %b, %a
+  %0 = fmul <4 x float> %a, %b
   ret <4 x float> %0
 }
 
-define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vmulq_m_s8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -35,11 +55,47 @@ entry:
   ret <16 x i8> %2
 }
 
-declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
+
+declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_m_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vmulq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
@@ -53,6 +109,63 @@ entry:
   ret <8 x half> %2
 }
 
-declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vmulq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+  ret <4 x float> %2
+}
+
+declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
 
-declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll
index 48f6a3cd23ad..be4ad54dd13c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll
@@ -12,26 +12,26 @@ entry:
   ret <16 x i8> %1
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vornq_u32:
+define arm_aapcs_vfpcc <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vornq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorn q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
-  %1 = or <4 x i32> %0, %a
-  ret <4 x i32> %1
+  %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %1 = or <8 x i16> %0, %a
+  ret <8 x i16> %1
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vornq_s16:
+define arm_aapcs_vfpcc <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vornq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorn q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-  %1 = or <8 x i16> %0, %a
-  ret <8 x i16> %1
+  %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = or <4 x i32> %0, %a
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x float> @test_vornq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
@@ -84,25 +84,98 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-; Function Attrs: nounwind readnone
-define arm_aapcs_vfpcc <8 x half> @test_vornq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vornq_m_f32:
+define arm_aapcs_vfpcc <4 x i32> @test_vornq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_m_s32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vornt q0, q1, q2
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vornq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_m_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vornt q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = bitcast <8 x half> %inactive to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> %0, <8 x i16> %1, <8 x i1> %3, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <8 x half>
+  ret <8 x half> %6
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vornq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vornt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.orn.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vornq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vornt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vornq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vornt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vornq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vornq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vornt q0, q0, q1
+; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast <4 x float> %a to <4 x i32>
   %1 = bitcast <4 x float> %b to <4 x i32>
   %2 = zext i16 %p to i32
   %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
-  %4 = bitcast <4 x float> %inactive to <4 x i32>
-  %5 = tail call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4)
-  %6 = bitcast <4 x i32> %5 to <8 x half>
-  ret <8 x half> %6
+  %4 = tail call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> undef)
+  %5 = bitcast <4 x i32> %4 to <4 x float>
+  ret <4 x float> %5
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-
-declare <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll
index ccb511a85e57..168dde0806b0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll
@@ -11,24 +11,24 @@ entry:
   ret <16 x i8> %0
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vorrq_u32:
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vorrq_s16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = or <4 x i32> %b, %a
-  ret <4 x i32> %0
+  %0 = or <8 x i16> %b, %a
+  ret <8 x i16> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vorrq_s16:
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vorrq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = or <8 x i16> %b, %a
-  ret <8 x i16> %0
+  %0 = or <4 x i32> %b, %a
+  ret <4 x i32> %0
 }
 
 define arm_aapcs_vfpcc <4 x float> @test_vorrq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
@@ -80,25 +80,98 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
 declare <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
 
-; Function Attrs: nounwind readnone
-define arm_aapcs_vfpcc <8 x half> @test_vorrq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
-; CHECK-LABEL: test_vorrq_m_f32:
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vorrq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_m_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vorrt q0, q1, q2
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = bitcast <8 x half> %inactive to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> %0, <8 x i16> %1, <8 x i1> %3, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <8 x half>
+  ret <8 x half> %6
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vorrq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vorrq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vorrq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
+; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast <4 x float> %a to <4 x i32>
   %1 = bitcast <4 x float> %b to <4 x i32>
   %2 = zext i16 %p to i32
   %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
-  %4 = bitcast <4 x float> %inactive to <4 x i32>
-  %5 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4)
-  %6 = bitcast <4 x i32> %5 to <8 x half>
-  ret <8 x half> %6
+  %4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> undef)
+  %5 = bitcast <4 x i32> %4 to <4 x float>
+  ret <4 x float> %5
 }
 
-declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-
-declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
index 0ba47befa379..15b23d5ed277 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
@@ -90,3 +90,46 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
 declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrhaddq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrhaddt.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrhaddq_x_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrhaddt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrhaddq_x_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrhaddt.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
index 3975e4eca872..9418f581eb4e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
@@ -90,3 +90,46 @@ entry:
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
 declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrmulhq_x_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmulht.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrmulhq_x_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmulht.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vrmulhq_m_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrmulht.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
new file mode 100644
index 000000000000..2959e61e0d8e
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vsubq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <4 x i32> %a, %b
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vsubq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fsub <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsubq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vsubq_m_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i8 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
+
+declare <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vsubq_m_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  ret <4 x float> %2
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
+
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsubq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vsubq_x_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
+
+declare <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
+; CHECK-LABEL: test_vsubq_x_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  ret <8 x half> %2
+}
+
+declare <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+


        


[llvm] 228c740 - [ARM][MVE][Intrinsics] Add *_x() variants of my *_m() intrinsics.

[llvm] 228c740 - [ARM][MVE][Intrinsics] Add _x() variants of my _m() intrinsics.