[clang] 489f62e - [ARM,MVE] Add vector-scalar intrinsics
Mikhail Maltsev via cfe-commits
cfe-commits at lists.llvm.org
Mon Feb 17 09:47:21 PST 2020
Author: Mikhail Maltsev
Date: 2020-02-17T17:47:05Z
New Revision: 489f62e8011f54fc94d9c3a4cc2d1d66d3b5bc49
URL: https://github.com/llvm/llvm-project/commit/489f62e8011f54fc94d9c3a4cc2d1d66d3b5bc49
DIFF: https://github.com/llvm/llvm-project/commit/489f62e8011f54fc94d9c3a4cc2d1d66d3b5bc49.diff
LOG: [ARM,MVE] Add vector-scalar intrinsics
Summary:
This patch adds vector-scalar variants to the following families of
MVE intrinsics:
* vaddq
* vsubq
* vmulq
* vqaddq
* vqsubq
* vhaddq
* vhsubq
* vqdmulhq
* vqrdmulhq
The vector-scalar variants perform a splat operation on the scalar
operand and then perform the same operations as their vector-vector
counterparts. Code generation is done accordingly (using LLVM IR 'insert'
and 'shuffle' operations which are later converted into an ARMvdup
SDNode).
Reviewers: simon_tatham, dmgreen, MarkMurrayARM, ostannard
Reviewed By: dmgreen
Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D74620
Added:
Modified:
clang/include/clang/Basic/arm_mve.td
clang/include/clang/Basic/arm_mve_defs.td
clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
llvm/lib/Target/ARM/ARMInstrMVE.td
llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 4edef930f3b4..5cd88b07ebaa 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -43,6 +43,12 @@ def vqaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRIntBase<"sadd_sat", [Vector]> $a, $b)>;
def vqsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRIntBase<"ssub_sat", [Vector]> $a, $b)>;
+let pnt = PNT_NType in {
+ def vqaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRIntBase<"sadd_sat", [Vector]> $a, (splat $b))>;
+ def vqsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRIntBase<"ssub_sat", [Vector]> $a, (splat $b))>;
+}
}
let params = T.Unsigned in {
def vqaddq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
@@ -51,6 +57,14 @@ def vqaddq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
def vqsubq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRIntBase<"usub_sat", [Vector]> $a, $b)>,
NameOverride<"vqsubq">;
+let pnt = PNT_NType in {
+ def vqaddq_u_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRIntBase<"uadd_sat", [Vector]> $a, (splat $b))>,
+ NameOverride<"vqaddq_n">;
+ def vqsubq_u_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRIntBase<"usub_sat", [Vector]> $a, (splat $b))>,
+ NameOverride<"vqsubq_n">;
+}
}
// Some intrinsics below are implemented not as IR fragments, but as
@@ -85,12 +99,32 @@ def vmullbq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
def vmulltq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
(IRInt<"vmull", [DblVector, Vector]>
$a, $b, (unsignedflag Scalar), 1)>;
+let pnt = PNT_NType in {
+ def vaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (add $a, (splat $b))>;
+ def vsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (sub $a, (splat $b))>;
+ def vmulq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (mul $a, (splat $b))>;
+ def vhaddq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRInt<"vhadd", [Vector]> $a, (splat $b),
+ (unsignedflag Scalar))>;
+ def vhsubq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRInt<"vhsub", [Vector]> $a, (splat $b),
+ (unsignedflag Scalar))>;
+}
}
let params = T.Signed in {
def vqdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRInt<"vqdmulh", [Vector]> $a, $b)>;
def vqrdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
(IRInt<"vqrdmulh", [Vector]> $a, $b)>;
+let pnt = PNT_NType in {
+ def vqdmulhq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRInt<"vqdmulh", [Vector]> $a, (splat $b))>;
+ def vqrdmulhq_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (IRInt<"vqrdmulh", [Vector]> $a, (splat $b))>;
+}
}
let params = T.Poly, overrideKindLetter = "p" in {
@@ -114,6 +148,18 @@ def vsubqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fsub $a, $b)>,
NameOverride<"vsubq">;
def vmulqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fmul $a, $b)>,
NameOverride<"vmulq">;
+
+let pnt = PNT_NType in {
+ def vaddqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (fadd $a, (splat $b))>,
+ NameOverride<"vaddq_n">;
+ def vsubqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (fsub $a, (splat $b))>,
+ NameOverride<"vsubq_n">;
+ def vmulqf_n: Intrinsic<Vector, (args Vector:$a, unpromoted<Scalar>:$b),
+ (fmul $a, (splat $b))>,
+ NameOverride<"vmulq_n">;
+}
}
let params = !listconcat(T.Int16, T.Int32) in {
@@ -217,6 +263,16 @@ multiclass VectorVectorArithmetic<string operation, dag extraArgs = (?),
extraArgs, (? $pred, $inactive)), wantXVariant>;
}
+multiclass VectorScalarArithmetic<string operation, string basename,
+ dag extraArgs = (?),
+ int wantXVariant = 1> {
+ defm "" : IntrinsicMXNameOverride<
+ Vector, (args Vector:$a, unpromoted<Scalar>:$b, Predicate:$pred),
+ !con((IRInt<operation, [Vector, Predicate]> $a, (splat $b)),
+ extraArgs, (? $pred, $inactive)), basename, wantXVariant, "_n",
+ PNT_NType, PNT_NType>;
+}
+
multiclass VectorVectorArithmeticBitcast<string operation> {
defm "" : IntrinsicMX<Vector, (args Vector:$a, Vector:$b,
Predicate:$pred),
@@ -238,6 +294,10 @@ let params = T.Usual in {
defm veorq : VectorVectorArithmeticBitcast<"eor_predicated">;
defm vornq : VectorVectorArithmeticBitcast<"orn_predicated">;
defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">;
+
+ defm : VectorScalarArithmetic<"add_predicated", "vaddq">;
+ defm : VectorScalarArithmetic<"sub_predicated", "vsubq">;
+ defm : VectorScalarArithmetic<"mul_predicated", "vmulq">;
}
multiclass DblVectorVectorArithmetic<string operation, dag extraArgs = (?)> {
@@ -260,6 +320,11 @@ let params = T.Int in {
defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;
defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;
defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;
+
+ defm : VectorScalarArithmetic<"qadd_predicated", "vqaddq", (? (unsignedflag Scalar)), 0>;
+ defm : VectorScalarArithmetic<"hadd_predicated", "vhaddq", (? (unsignedflag Scalar))>;
+ defm : VectorScalarArithmetic<"qsub_predicated", "vqsubq", (? (unsignedflag Scalar)), 0>;
+ defm : VectorScalarArithmetic<"hsub_predicated", "vhsubq", (? (unsignedflag Scalar))>;
}
let params = T.Signed in {
defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;
@@ -268,6 +333,9 @@ let params = T.Signed in {
(IRInt<"vmina_predicated", [UVector,Predicate]> $a, $b, $pred)>;
def vmaxaq_m: Intrinsic<UVector, (args UVector:$a, Vector:$b, Predicate:$pred),
(IRInt<"vmaxa_predicated", [UVector,Predicate]> $a, $b, $pred)>;
+
+ defm : VectorScalarArithmetic<"qdmulh_predicated", "vqdmulhq", (?), 0>;
+ defm : VectorScalarArithmetic<"qrdmulh_predicated", "vqrdmulhq", (?), 0>;
}
let params = T.Poly, overrideKindLetter = "p" in {
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 2a462974bb12..d4e821589cfd 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -495,6 +495,29 @@ multiclass IntrinsicMX<Type rettype, dag arguments, dag cg,
}
}
+// Same as above, but with an additional parameter 'basename' which overrides
+// the C intrinsic base name
+multiclass IntrinsicMXNameOverride<Type rettype, dag arguments, dag cg,
+ string basename, int wantXVariant = 1,
+ string nameSuffix = "",
+ PolymorphicNameType pnt_m = PNT_Type,
+ PolymorphicNameType pnt_x = PNT_Type> {
+ def "_m" # nameSuffix:
+ Intrinsic<rettype, !con((args rettype:$inactive), arguments), cg>,
+ NameOverride<basename # "_m" # nameSuffix> {
+ let pnt = pnt_m;
+ }
+
+ foreach unusedVar = !if(!eq(wantXVariant, 1), [1], []<int>) in {
+ def "_x" # nameSuffix:
+ Intrinsic<rettype, arguments, (seq (undef rettype):$inactive, cg)>,
+ NameOverride<basename # "_x" # nameSuffix> {
+ let pnt = pnt_x;
+ }
+ }
+}
+
+
// -----------------------------------------------------------------------------
// Convenience lists of parameter types. 'T' is just a container record, so you
// can define a typical intrinsic with 'let Params = T.Usual', or similar,
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
index 1810c7e89b54..1904c6e835c7 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
#include <arm_mve.h>
@@ -95,3 +95,113 @@ float16x8_t test_vaddq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
return vaddq_x_f16(a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vaddq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vaddq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vaddq(a, b);
+#else /* POLYMORPHIC */
+ return vaddq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b)
+{
+#ifdef POLYMORPHIC
+ return vaddq(a, b);
+#else /* POLYMORPHIC */
+ return vaddq_n_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vaddq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_m_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vaddq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vaddq_m_n_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_x_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vaddq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddq_x_n_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_x_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
+// CHECK-NEXT: ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vaddq_x_n_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
index 1d97ea80520c..cd61bc782c7d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
@@ -141,3 +141,159 @@ uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
return vhaddq_x_u32(a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vhaddq_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vhaddq_n_u8(uint8x16_t a, uint8_t b)
+{
+#ifdef POLYMORPHIC
+ return vhaddq(a, b);
+#else /* POLYMORPHIC */
+ return vhaddq_n_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vhaddq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vhaddq(a, b);
+#else /* POLYMORPHIC */
+ return vhaddq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vhaddq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vhaddq(a, b);
+#else /* POLYMORPHIC */
+ return vhaddq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vhaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_m_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vhaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_m_n_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vhaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_x_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vhaddq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_x_n_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_x_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vhaddq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_x_n_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhaddq_x_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vhaddq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhaddq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhaddq_x_n_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
index 633fd32d947f..529936998b37 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
@@ -93,3 +93,159 @@ int32x4_t test_vhsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
return vhsubq_m_s32(inactive, a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vhsubq_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vhsubq_n_u8(uint8x16_t a, uint8_t b)
+{
+#ifdef POLYMORPHIC
+ return vhsubq(a, b);
+#else /* POLYMORPHIC */
+ return vhsubq_n_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vhsubq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vhsubq(a, b);
+#else /* POLYMORPHIC */
+ return vhsubq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vhsubq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vhsubq(a, b);
+#else /* POLYMORPHIC */
+ return vhsubq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vhsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_m_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vhsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_m_n_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vhsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_x_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vhsubq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_x_n_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_x_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vhsubq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_x_n_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vhsubq_x_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vhsubq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vhsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vhsubq_x_n_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
index 536bc7322fc2..3619dabb81e1 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
#include <arm_mve.h>
@@ -172,14 +172,14 @@ uint32x4_t test_vmulq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
#endif /* POLYMORPHIC */
}
-// CHECK-LABEL: @test_vmulq_m_f32(
+// CHECK-LABEL: @test_vmulq_x_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
-float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+float32x4_t test_vmulq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
{
#ifdef POLYMORPHIC
return vmulq_x(a, b, p);
@@ -187,3 +187,213 @@ float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
return vmulq_x_f32(a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vmulq_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vmulq_n_u8(uint8x16_t a, uint8_t b)
+{
+#ifdef POLYMORPHIC
+ return vmulq(a, b);
+#else /* POLYMORPHIC */
+ return vmulq_n_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vmulq(a, b);
+#else /* POLYMORPHIC */
+ return vmulq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vmulq(a, b);
+#else /* POLYMORPHIC */
+ return vmulq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b)
+{
+#ifdef POLYMORPHIC
+ return vmulq(a, b);
+#else /* POLYMORPHIC */
+ return vmulq_n_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmulq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmulq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_n_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmulq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_n_f16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_x_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmulq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_x_n_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_x_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmulq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_x_n_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+// CHECK-LABEL: @test_vmulq_x_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmulq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_x_n_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_x_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vmulq_x_n_f32(float32x4_t a, float32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_x_n_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
index 8361ab3618ed..8ed34dd752f3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
@@ -93,3 +93,105 @@ int32x4_t test_vqaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
return vqaddq_m_s32(inactive, a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vqaddq_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vqaddq_n_u8(uint8x16_t a, uint8_t b)
+{
+#ifdef POLYMORPHIC
+ return vqaddq(a, b);
+#else /* POLYMORPHIC */
+ return vqaddq_n_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqaddq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vqaddq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vqaddq(a, b);
+#else /* POLYMORPHIC */
+ return vqaddq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqaddq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vqaddq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vqaddq(a, b);
+#else /* POLYMORPHIC */
+ return vqaddq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqaddq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vqaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqaddq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqaddq_m_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vqaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqaddq_m_n_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqaddq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqaddq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
index eb7e0a0afdf3..bab911f385e7 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
@@ -93,3 +93,105 @@ int32x4_t test_vqdmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_
return vqdmulhq_m_s32(inactive, a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vqdmulhq_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+int8x16_t test_vqdmulhq_n_s8(int8x16_t a, int8_t b)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqdmulhq_n_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulhq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqdmulhq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulhq_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqdmulhq_n_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulhq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vqdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqdmulhq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulhq_m_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vqdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqdmulhq_m_n_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqdmulhq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqdmulhq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqdmulhq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
index 27b7efd31014..919074e7fddb 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
@@ -93,3 +93,105 @@ int32x4_t test_vqrdmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve
return vqrdmulhq_m_s32(inactive, a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vqrdmulhq_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+int8x16_t test_vqrdmulhq_n_s8(int8x16_t a, int8_t b)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_n_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqrdmulhq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqrdmulhq_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq(a, b);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_n_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqrdmulhq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vqrdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqrdmulhq_m_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vqrdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_m_n_s16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqrdmulhq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqrdmulhq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqrdmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqrdmulhq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
index c8924bf91280..28ced90ef0f7 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
@@ -93,3 +93,105 @@ int32x4_t test_vqsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
return vqsubq_m_s32(inactive, a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vqsubq_n_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vqsubq_n_u8(uint8x16_t a, uint8_t b)
+{
+#ifdef POLYMORPHIC
+ return vqsubq(a, b);
+#else /* POLYMORPHIC */
+ return vqsubq_n_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqsubq_n_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vqsubq_n_s16(int16x8_t a, int16_t b)
+{
+#ifdef POLYMORPHIC
+ return vqsubq(a, b);
+#else /* POLYMORPHIC */
+ return vqsubq_n_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqsubq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vqsubq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vqsubq(a, b);
+#else /* POLYMORPHIC */
+ return vqsubq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqsubq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vqsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqsubq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqsubq_m_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vqsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqsubq_m_n_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqsubq_m_n_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vqsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vqsubq_m_n_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
index dd11aa848bcc..7231ae966306 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
#include <arm_mve.h>
@@ -95,3 +95,113 @@ float16x8_t test_vsubq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
return vsubq_x_f16(a, b, p);
#endif /* POLYMORPHIC */
}
+
+// CHECK-LABEL: @test_vsubq_n_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = sub <4 x i32> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vsubq_n_u32(uint32x4_t a, uint32_t b)
+{
+#ifdef POLYMORPHIC
+ return vsubq(a, b);
+#else /* POLYMORPHIC */
+ return vsubq_n_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b)
+{
+#ifdef POLYMORPHIC
+ return vsubq(a, b);
+#else /* POLYMORPHIC */
+ return vsubq_n_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_m_n_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vsubq_m_n_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_m_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vsubq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vsubq_m_n_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_x_n_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vsubq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vsubq_x_n_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_x_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
+// CHECK-NEXT: ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vsubq_x(a, b, p);
+#else /* POLYMORPHIC */
+ return vsubq_x_n_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 8a03c50d1094..5a2bb9c89c28 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4480,10 +4480,65 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
let Inst{3-0} = Rm{3-0};
}
+// Patterns for vector-scalar instructions with integer operands
+multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
+ SDNode unpred_op, SDNode pred_op,
+ bit unpred_has_sign = 0,
+ bit pred_has_sign = 0> {
+ defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?));
+ defvar PredSign = !if(pred_has_sign, (? (i32 VTI.Unsigned)), (?));
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated version
+ def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ UnpredSign)),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated version
+ def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ PredSign,
+ (pred_op (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+// Patterns for vector-scalar instructions with FP operands
+multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int,
+ Instruction instr_f16,
+ Instruction instr_f32> {
+ let Predicates = [HasMVEFloat] in {
+ // Unpredicated F16
+ def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>;
+ // Unpredicated F32
+ def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>;
+ // Predicated F16
+ def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)),
+ (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
+ // Preicated F32
+ def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)),
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ }
+}
+
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ bit bit_5, bit bit_12, bit bit_16, bit bit_28>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -4494,42 +4549,60 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]> {
- def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00,
- bit_5, bit_12, bit_16, bit_28>;
- def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01,
- bit_5, bit_12, bit_16, bit_28>;
- def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10,
- bit_5, bit_12, bit_16, bit_28>;
-}
-
-defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
-defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
-defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
-
-defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
-defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
-defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
+// Vector-scalar add/sub
+multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int>;
+}
+
+multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+
+multiclass MVE_VSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
+
+defm MVE_VADD_qr_i8 : MVE_VADD_qr_m<MVE_v16i8>;
+defm MVE_VADD_qr_i16 : MVE_VADD_qr_m<MVE_v8i16>;
+defm MVE_VADD_qr_i32 : MVE_VADD_qr_m<MVE_v4i32>;
+
+defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m<MVE_v16i8>;
+defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m<MVE_v8i16>;
+defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>;
+
+// Vector-scalar saturating add/sub
+multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op_s, SDNode unpred_op_u,
+ Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract,
+ 0b0, VTI.Unsigned>;
+ defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s);
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int, 0, 1>;
+}
+
+multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat,
+ int_arm_mve_qadd_predicated>;
+
+multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat,
+ int_arm_mve_qsub_predicated>;
+
+defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>;
+defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>;
+defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>;
+defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>;
+defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>;
+defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>;
+
+defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>;
+defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>;
+defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>;
+defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>;
+defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>;
+defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>;
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
@@ -4566,19 +4639,34 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
-def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>;
-def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>;
-def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>;
-def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>;
-def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ Intrinsic unpred_int, Intrinsic pred_int> {
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME),
+ VTI, unpred_int, pred_int, 1, 1>;
+}
-def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>;
-def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>;
-def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>;
-def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>;
-def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>;
-def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>;
+multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd,
+ int_arm_mve_hadd_predicated>;
+
+multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub,
+ int_arm_mve_hsub_predicated>;
+
+defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>;
+defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>;
+defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>;
+defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>;
+defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>;
+defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>;
+
+defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>;
+defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>;
+defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>;
+defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>;
+defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>;
+defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>;
let Predicates = [HasMVEFloat] in {
def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
@@ -4588,6 +4676,11 @@ let Predicates = [HasMVEFloat] in {
def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
}
+defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated,
+ MVE_VADD_qr_f16, MVE_VADD_qr_f32>;
+defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated,
+ MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>;
+
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
: MVE_qDest_single_rSrc<iname, suffix, pattern> {
@@ -4678,9 +4771,8 @@ let Predicates = [HasMVEInt] in {
(v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
}
-class MVE_VMUL_qr_int<string iname, string suffix,
- bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -4691,19 +4783,16 @@ class MVE_VMUL_qr_int<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
-def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
-def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ mul, int_arm_mve_mul_predicated>;
}
+defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>;
+defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>;
+defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m<MVE_v4i32>;
+
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
@@ -4716,19 +4805,37 @@ class MVE_VxxMUL_qr<string iname, string suffix,
let Inst{5} = 0b1;
}
-def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>;
-def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>;
-def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>;
+multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
+ Intrinsic int_unpred, Intrinsic int_pred> {
+ def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ int_unpred, int_pred>;
+}
-def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
-def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
-def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
+multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0,
+ int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>;
+
+multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1,
+ int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>;
+
+defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m<MVE_v4s32>;
+
+defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
+defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated,
+ MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>;
+
class MVE_VFMAMLA_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit S,
list<dag> pattern=[]>
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
index b7becf6dd2c2..03e64cc06641 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll
@@ -91,3 +91,99 @@ entry:
declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+define arm_aapcs_vfpcc <4 x i32> @test_vaddq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vaddq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = add <4 x i32> %.splat, %a
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vaddq_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vadd.f16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast float %b.coerce to i32
+ %tmp.0.extract.trunc = trunc i32 %0 to i16
+ %1 = bitcast i16 %tmp.0.extract.trunc to half
+ %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+ %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ %2 = fadd <8 x half> %.splat, %a
+ ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddt.i8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_m_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddt.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive)
+ ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vaddq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_x_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddt.i16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) {
+; CHECK-LABEL: test_vaddq_x_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vaddt.f16 q0, q0, r1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast float %b.coerce to i32
+ %tmp.0.extract.trunc = trunc i32 %0 to i16
+ %1 = bitcast i16 %tmp.0.extract.trunc to half
+ %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+ %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ %2 = zext i16 %p to i32
+ %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+ %4 = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef)
+ ret <8 x half> %4
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
index 9319dd1c4c4b..0a3935e9f15d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
@@ -133,3 +133,134 @@ entry:
ret <4 x i32> %2
}
+define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_n_u8(<16 x i8> %a, i8 zeroext %b) {
+; CHECK-LABEL: test_vhaddq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhadd.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vhaddq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhadd.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vhaddq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhadd.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.u16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_x_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_x_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhaddq_x_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhaddt.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <4 x i32> undef)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
index 5bc8a22810a0..ff474cd6c254 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
@@ -90,3 +90,135 @@ entry:
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_n_u8(<16 x i8> %a, i8 zeroext %b) {
+; CHECK-LABEL: test_vhsubq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhsub.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vhsubq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhsub.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vhsubq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhsub.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.u16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_x_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_x_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vhsubq_x_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vhsubt.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <4 x i32> undef)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
index 19ed08a77a22..a342647e16a2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
@@ -169,3 +169,184 @@ entry:
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
+; CHECK-LABEL: test_vmulq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmul.i8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = mul <16 x i8> %.splat, %a
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vmulq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmul.i16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = mul <8 x i16> %.splat, %a
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vmulq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmul.i32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = mul <4 x i32> %.splat, %a
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
+; CHECK-LABEL: test_vmulq_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmul.f32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = fmul <4 x float> %.splat, %a
+ ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.f16 q0, q1, r1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast float %b.coerce to i32
+ %tmp.0.extract.trunc = trunc i32 %0 to i16
+ %1 = bitcast i16 %tmp.0.extract.trunc to half
+ %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+ %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ %2 = zext i16 %p to i32
+ %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+ %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
+ ret <8 x half> %4
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_x_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_x_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_x_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.i32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_x_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmult.f32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)
+ ret <4 x float> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
index b05fded2b307..8d3021830983 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
@@ -90,3 +90,87 @@ entry:
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_n_u8(<16 x i8> %a, i8 zeroext %b) {
+; CHECK-LABEL: test_vqaddq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqadd.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqaddq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqadd.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vqaddq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqadd.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqaddq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqaddt.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqaddq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqaddt.u16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqaddq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqaddt.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll
index 58e54a9d268e..58cfeb3785fd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll
@@ -90,3 +90,87 @@ entry:
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_n_s8(<16 x i8> %a, i8 signext %b) {
+; CHECK-LABEL: test_vqdmulhq_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqdmulh.s8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.arm.mve.vqdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqdmulhq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqdmulh.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vqdmulhq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqdmulh.s32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulhq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqdmulht.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulhq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqdmulht.s16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqdmulhq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqdmulht.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll
index 806fb7bc930b..8c34745c57c3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll
@@ -90,3 +90,87 @@ entry:
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_n_s8(<16 x i8> %a, i8 signext %b) {
+; CHECK-LABEL: test_vqrdmulhq_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmulh.s8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqrdmulhq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmulh.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vqrdmulhq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmulh.s32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrdmulhq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrdmulht.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrdmulhq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrdmulht.s16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrdmulhq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrdmulht.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
index 37f251c83a3e..9ceab4cf73de 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
@@ -90,3 +90,87 @@ entry:
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_n_u8(<16 x i8> %a, i8 zeroext %b) {
+; CHECK-LABEL: test_vqsubq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqsub.u8 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_n_s16(<8 x i16> %a, i16 signext %b) {
+; CHECK-LABEL: test_vqsubq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqsub.s16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vqsubq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqsub.u32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqsubq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqsubt.s8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqsubq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqsubt.u16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqsubq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqsubt.s32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+ ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
index 2959e61e0d8e..243ff4070af3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll
@@ -91,3 +91,99 @@ entry:
declare <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+define arm_aapcs_vfpcc <4 x i32> @test_vsubq_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: test_vsubq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsub.i32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %0 = sub <4 x i32> %a, %.splat
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_n_f16(<8 x half> %a, float %b.coerce) {
+; CHECK-LABEL: test_vsubq_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vsub.f16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast float %b.coerce to i32
+ %tmp.0.extract.trunc = trunc i32 %0 to i16
+ %1 = bitcast i16 %tmp.0.extract.trunc to half
+ %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+ %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ %2 = fsub <8 x half> %a, %.splat
+ ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsubt.i8 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
+ %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_m_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsubt.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive)
+ ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsubq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_x_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsubt.i16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) {
+; CHECK-LABEL: test_vsubq_x_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsubt.f16 q0, q0, r1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast float %b.coerce to i32
+ %tmp.0.extract.trunc = trunc i32 %0 to i16
+ %1 = bitcast i16 %tmp.0.extract.trunc to half
+ %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+ %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ %2 = zext i16 %p to i32
+ %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+ %4 = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef)
+ ret <8 x half> %4
+}
More information about the cfe-commits
mailing list