[clang] f4fd7db - [ARM,MVE] Add vqdmull[b,t]q intrinsic families

Thu Feb 20 02:52:00 PST 2020

Author: Mikhail Maltsev
Date: 2020-02-20T10:51:19Z
New Revision: f4fd7dbf85e278eff303514760bff4773a87e601

URL: https://github.com/llvm/llvm-project/commit/f4fd7dbf85e278eff303514760bff4773a87e601
DIFF: https://github.com/llvm/llvm-project/commit/f4fd7dbf85e278eff303514760bff4773a87e601.diff

LOG: [ARM,MVE] Add vqdmull[b,t]q intrinsic families

Summary:
This patch adds two families of ACLE intrinsics: vqdmullbq and
vqdmulltq (including vector-vector and vector-scalar variants) and the
corresponding LLVM IR intrinsics llvm.arm.mve.vqdmull and
llvm.arm.mve.vqdmull.predicated.

Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D74845

Added: 
    clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
    clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll

Modified: 
    clang/include/clang/Basic/arm_mve.td
    llvm/include/llvm/IR/IntrinsicsARM.td
    llvm/lib/Target/ARM/ARMInstrMVE.td

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index bfb49864922f..ca7246d78bd6 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -330,11 +330,22 @@ let params = T.Usual in {
   defm : VectorScalarArithmetic<"mul_predicated", "vmulq">;
 }
 
-multiclass DblVectorVectorArithmetic<string operation, dag extraArgs = (?)> {
+multiclass DblVectorVectorArithmetic<string operation, dag extraArgs = (?),
+                                     int wantXVariant = 1> {
   defm "" : IntrinsicMX<
       DblVector, (args Vector:$a, Vector:$b, DblPredicate:$pred),
       !con((IRInt<operation, [DblVector, Vector, DblPredicate]> $a, $b),
-           extraArgs, (? $pred, $inactive))>;
+           extraArgs, (? $pred, $inactive)), wantXVariant>;
+}
+
+multiclass DblVectorScalarArithmetic<string operation, string basename,
+                                     dag extraArgs = (?),
+                                     int wantXVariant = 1> {
+  defm "" : IntrinsicMXNameOverride<
+      DblVector, (args Vector:$a, unpromoted<Scalar>:$b, DblPredicate:$pred),
+      !con((IRInt<operation, [DblVector, Vector, DblPredicate]> $a, (splat $b)),
+           extraArgs, (? $pred, $inactive)), basename, wantXVariant, "_n",
+           PNT_NType, PNT_NType>;
 }
 
 // Predicated intrinsics - Int types only
@@ -373,6 +384,28 @@ let params = T.Poly, overrideKindLetter = "p" in {
   defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>;
 }
 
+let params = [s16, s32] in {
+  def  vqdmullbq:  Intrinsic<DblVector, (args Vector:$a, Vector:$b),
+                             (IRInt<"vqdmull", [DblVector, Vector]> $a, $b, 0)>;
+  def  vqdmulltq:  Intrinsic<DblVector, (args Vector:$a, Vector:$b),
+                             (IRInt<"vqdmull", [DblVector, Vector]> $a, $b, 1)>;
+  defm vqdmullbq:  DblVectorVectorArithmetic<"vqdmull_predicated", (? (u32 0)), 0>;
+  defm vqdmulltq:  DblVectorVectorArithmetic<"vqdmull_predicated", (? (u32 1)), 0>;
+
+  let pnt = PNT_NType in {
+    def vqdmullbq_n: Intrinsic<DblVector, (args Vector:$a, unpromoted<Scalar>:$b),
+                               (IRInt<"vqdmull", [DblVector, Vector]>
+                                $a, (splat $b), 0)>;
+    def vqdmulltq_n: Intrinsic<DblVector, (args Vector:$a, unpromoted<Scalar>:$b),
+                               (IRInt<"vqdmull", [DblVector, Vector]>
+                                $a, (splat $b), 1)>;
+  }
+  defm vqdmullbq_n: DblVectorScalarArithmetic<"vqdmull_predicated",
+                                              "vqdmullbq", (? (u32 0)), 0>;
+  defm vqdmulltq_n: DblVectorScalarArithmetic<"vqdmull_predicated",
+                                              "vqdmulltq", (? (u32 1)), 0>;
+}
+
 // Predicated intrinsics - Float types only
 let params = T.Float in {
   defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>;

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
new file mode 100644
index 000000000000..c7aa5a3c17b5
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
@@ -0,0 +1,125 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vqdmullbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqdmullbq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vqdmullbq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
+int64x2_t test_vqdmullbq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vqdmullbq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_m_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqdmullbq_m_s16(int32x4_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmullbq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_m_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+int64x2_t test_vqdmullbq_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmullbq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqdmullbq_n_s16(int16x8_t a, int16_t b) {
+#ifdef POLYMORPHIC
+  return vqdmullbq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0)
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
+int64x2_t test_vqdmullbq_n_s32(int32x4_t a, int32_t b) {
+#ifdef POLYMORPHIC
+  return vqdmullbq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_n_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqdmullbq_m_n_s16(int32x4_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmullbq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_m_n_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmullbq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+int64x2_t test_vqdmullbq_m_n_s32(int64x2_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmullbq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmullbq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
new file mode 100644
index 000000000000..0a03e2e963df
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
@@ -0,0 +1,125 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vqdmulltq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqdmulltq_s16(int16x8_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vqdmulltq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
+int64x2_t test_vqdmulltq_s32(int32x4_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vqdmulltq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_m_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqdmulltq_m_s16(int32x4_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmulltq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_m_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+int64x2_t test_vqdmulltq_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmulltq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqdmulltq_n_s16(int16x8_t a, int16_t b) {
+#ifdef POLYMORPHIC
+  return vqdmulltq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
+int64x2_t test_vqdmulltq_n_s32(int32x4_t a, int32_t b) {
+#ifdef POLYMORPHIC
+  return vqdmulltq(a, b);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_n_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqdmulltq_m_n_s16(int32x4_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmulltq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_m_n_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulltq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
+int64x2_t test_vqdmulltq_m_n_s32(int64x2_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vqdmulltq_m(inactive, a, b, p);
+#else  /* POLYMORPHIC */
+  return vqdmulltq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}

diff  --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 9f209e8102b9..68af4ae82579 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1168,4 +1168,14 @@ defm int_arm_mve_vbrsr: MVEMXPredicated<
   [llvm_anyvector_ty], [],
   [LLVMMatchType<0>, llvm_i32_ty], LLVMMatchType<0>, llvm_anyvector_ty>;
 
+def int_arm_mve_vqdmull: Intrinsic<
+  [llvm_anyvector_ty],
+  [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty],
+  [IntrNoMem]>;
+def int_arm_mve_vqdmull_predicated: Intrinsic<
+  [llvm_anyvector_ty],
+  [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
+   LLVMMatchType<0>],
+  [IntrNoMem]>;
+
 } // end TargetPrefix

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 735fc0e497c1..9a7886d6ecc5 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4577,13 +4577,34 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
   let validForTailPredication = 1;
 }
 
-multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
-  def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
-  def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T,
+                         string cstr> {
+  def "" : MVE_VQDMULL<iname, VTI.Suffix, size, T, cstr>;
+  defvar Inst = !cast<Instruction>(NAME);
+
+  let Predicates = [HasMVEInt] in {
+    // Unpredicated saturating multiply
+    def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+                                               (VTI.Vec MQPR:$Qn), (i32 T))),
+              (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+    // Predicated saturating multiply
+    def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+                                    (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                                    (i32 T), (VTI.DblPred VCCR:$mask),
+                                    (VTI.DblVec MQPR:$inactive))),
+              (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                                ARMVCCThen, (VTI.DblPred VCCR:$mask),
+                                (VTI.DblVec MQPR:$inactive)))>;
+  }
 }
 
-defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
-defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VQDMULL_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+  defm bh : MVE_VQDMULL_m<"vqdmullb", VTI, size, 0b0, cstr>;
+  defm th : MVE_VQDMULL_m<"vqdmullt", VTI, size, 0b1, cstr>;
+}
+
+defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
 
 // end of mve_qDest_qSrc
 
@@ -4766,13 +4787,37 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
   let validForTailPredication = 1;
 }
 
-multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
-  def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
-  def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size,
+                            bit T, string cstr> {
+  def "" : MVE_VQDMULL_qr<iname, VTI.Suffix, size, T, cstr>;
+  defvar Inst = !cast<Instruction>(NAME);
+
+  let Predicates = [HasMVEInt] in {
+    // Unpredicated saturating multiply
+    def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+                                               (VTI.Vec (ARMvdup rGPR:$val)),
+                                               (i32 T))),
+              (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+    // Predicated saturating multiply
+    def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+                                    (VTI.Vec MQPR:$Qm),
+                                    (VTI.Vec (ARMvdup rGPR:$val)),
+                                    (i32 T),
+                                    (VTI.DblPred VCCR:$mask),
+                                    (VTI.DblVec MQPR:$inactive))),
+              (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+                             ARMVCCThen, (VTI.DblPred VCCR:$mask),
+                             (VTI.DblVec MQPR:$inactive)))>;
+  }
+}
+
+multiclass MVE_VQDMULL_qr_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+  defm bh : MVE_VQDMULL_qr_m<"vqdmullb", VTI, size, 0b0, cstr>;
+  defm th : MVE_VQDMULL_qr_m<"vqdmullt", VTI, size, 0b1, cstr>;
 }
 
-defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
-defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
+defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
 
 class MVE_VxADDSUB_qr<string iname, string suffix,
                       bit bit_28, bits<2> bits_21_20, bit subtract,

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll
new file mode 100644
index 000000000000..f05ea7fc2b05
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll
@@ -0,0 +1,221 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmullbq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqdmullbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullb.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmullbq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqdmullbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullb.s32 q2, q0, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+  ret <2 x i64> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmullbq_m_s16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmullbq_m_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmullbt.s16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %a, <8 x i16> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmullbq_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmullbq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmullbt.s32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive)
+  ret <2 x i64> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmullbq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqdmullbq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullb.s16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %0 = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmullbq_n_s32(<4 x i32> %a, i32 %b) #0 {
+; CHECK-LABEL: test_vqdmullbq_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullb.s32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %0 = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 0)
+  ret <2 x i64> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmullbq_m_n_s16(<4 x i32> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmullbq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmullbt.s16 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmullbq_m_n_s32(<2 x i64> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmullbq_m_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmullbt.s32 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <2 x i64> %inactive)
+  ret <2 x i64> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulltq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqdmulltq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullt.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmulltq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqdmulltq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullt.s32 q2, q0, q1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
+  ret <2 x i64> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulltq_m_s16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulltq_m_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmulltt.s16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %a, <8 x i16> %b, i32 1, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmulltq_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulltq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmulltt.s32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive)
+  ret <2 x i64> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulltq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqdmulltq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullt.s16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %0 = call <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmulltq_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vqdmulltq_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqdmullt.s32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %0 = call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1)
+  ret <2 x i64> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulltq_m_n_s16(<4 x i32> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulltq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmulltt.s16 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmulltq_m_n_s32(<2 x i64> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulltq_m_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqdmulltt.s32 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <2 x i64> %inactive)
+  ret <2 x i64> %2
+}