[clang] [llvm] [ARM] Introduce intrinsics for MVE add/sub/mul under strict-fp. (PR #169156)

Sat Nov 22 01:42:38 PST 2025

https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/169156

As far as I understand, the MVE fp vadd/vsub/vmul instructions will set exception flags in the same ways as scalar fadd/fsub/fmul, but will not honor flush-to-zero (for f32 they always flush, for f16 they follows the fpsrc flags) and will always use the default rounding mode.

This means that we cannot convert the vadd_f23/vsub_f32/vmul_f32 intrinsics to llvm.constrained.fadd/fsub/fmul, and the fadd/fsub/fmul without changing the expected behaviour under strict-fp. This patch introduces a set in intrinsics that we can use instead, going from vadd_f32 -> llvm.arm.mve.vadd -> MVE_VADD.

If this is acceptable, the other intrinsics I can add will be:
  fma
  fptoi, itofp
  trunc/round/ceil/etc
  fcmp
  minnum/maxnum
The current implementations assumes that the standard variant of a strictfp alternative will be a IRBuilder, this can be changed to take a IRBuilder or IRInt. There are also Neon intrinsics that AFAIU behave the same way.

>From 4d8d1a25b53cfaac8ab9c61fcd6aee54e40017cc Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sat, 22 Nov 2025 09:39:57 +0000
Subject: [PATCH] [ARM] Introduce intrinsics for MVE add/sub/mul under
 strict-fp.

As far as I understand, the MVE fp vadd/vsub/vmul instructions will set
exception flags in the same ways as scalar fadd/fsub/fmul, but will not honor
flush-to-zero (for f32 they always flush, for f16 they follows the fpsrc flags)
and will always use the default rounding mode.

This means that we cannot convert the vadd_f23/vsub_f32/vmul_f32 intrinsics to
llvm.constrained.fadd/fsub/fmul, and the fadd/fsub/fmul without changing the
expected behaviour under strict-fp. This patch introduces a set in intrinsics
that we can use instead, going from vadd_f32 -> llvm.arm.mve.vadd -> MVE_VADD.

If this is acceptable, the other intrinsics I can add will be:
  fma
  fptoi, itofp
  trunc/round/ceil/etc
  fcmp
  minnum/maxnum
The current implementations assumes that the standard variant of a strictfp
alternative will be a IRBuilder, this can be changed to take a IRBuilder or
IRInt. There are also Neon intrinsics that AFAIU behave the same way.
---
 clang/include/clang/Basic/arm_mve_defs.td     |  21 +-
 clang/test/CodeGen/arm-mve-intrinsics/vaddq.c | 214 +++++++---
 clang/test/CodeGen/arm-mve-intrinsics/vmulq.c | 390 ++++++++++++------
 clang/test/CodeGen/arm-mve-intrinsics/vsubq.c | 214 +++++++---
 clang/utils/TableGen/MveEmitter.cpp           |  39 +-
 llvm/include/llvm/IR/IntrinsicsARM.td         |  12 +
 llvm/lib/Target/ARM/ARMInstrMVE.td            |  33 +-
 .../mve-intrinsics/strict-intrinsics.ll       | 143 +++++++
 .../CodeGen/Thumb2/mve-intrinsics/vabdq.ll    |   4 +-
 llvm/test/CodeGen/Thumb2/mve-pred-ext.ll      |  12 +-
 10 files changed, 797 insertions(+), 285 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll

diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index c1562a0c1f04c..eeca9153dd742 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -74,9 +74,9 @@ def immshr: CGHelperFn<"MVEImmediateShr"> {
   let special_params = [IRBuilderIntParam<1, "unsigned">,
                         IRBuilderIntParam<2, "bool">];
 }
-def fadd: IRBuilder<"CreateFAdd">;
-def fmul: IRBuilder<"CreateFMul">;
-def fsub: IRBuilder<"CreateFSub">;
+def fadd_node: IRBuilder<"CreateFAdd">;
+def fmul_node: IRBuilder<"CreateFMul">;
+def fsub_node: IRBuilder<"CreateFSub">;
 def load: IRBuilder<"CreateLoad"> {
   let special_params = [IRBuilderAddrParam<0>];
 }
@@ -212,6 +212,13 @@ def unsignedflag;
 // constant giving its size in bits.
 def bitsize;
 
+// strictFPAlt allows a node to have different code generation under strict-fp.
+// TODO: The standard node can be IRBuilderBase or IRIntBase.
+class strictFPAlt<IRBuilderBase standard_, IRIntBase strictfp_> {
+  IRBuilderBase standard = standard_;
+  IRIntBase strictfp = strictfp_;
+}
+
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen
@@ -573,6 +580,14 @@ multiclass IntrinsicMXNameOverride<Type rettype, dag arguments, dag cg,
   }
 }
 
+// StrictFP nodes that choose between standard fadd and llvm.arm.mve.fadd nodes
+// depending on whether we are using strict-fp.
+def fadd: strictFPAlt<fadd_node,
+                      IRInt<"vadd", [Vector]>>;
+def fsub: strictFPAlt<fsub_node,
+                      IRInt<"vsub", [Vector]>>;
+def fmul: strictFPAlt<fmul_node,
+                      IRInt<"vmul", [Vector]>>;
 
 // -----------------------------------------------------------------------------
 // Convenience lists of parameter types. 'T' is just a container record, so you
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
index 238cb4056d4f1..d24834951b385 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -1,6 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
@@ -20,10 +22,15 @@ uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[B:%.*]]
-// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]]) #[[ATTR2:[0-9]+]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b)
 {
@@ -34,12 +41,19 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_m_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_m_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_m_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
 {
@@ -50,12 +64,19 @@ int8x16_t test_vaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_m_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_m_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_m_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vaddq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
 {
@@ -66,12 +87,19 @@ float32x4_t test_vaddq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_x_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_x_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_x_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
@@ -82,12 +110,19 @@ uint16x8_t test_vaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_x_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_x_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_x_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vaddq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 {
@@ -114,12 +149,19 @@ uint32x4_t test_vaddq_n_u32(uint32x4_t a, uint32_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_n_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_n_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_n_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b)
 {
@@ -130,14 +172,23 @@ float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_m_n_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_m_n_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_m_n_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
 {
@@ -148,14 +199,23 @@ int8x16_t test_vaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred1
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_m_n_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_m_n_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_m_n_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vaddq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p)
 {
@@ -166,14 +226,23 @@ float32x4_t test_vaddq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_x_n_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_x_n_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_x_n_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vaddq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
 {
@@ -184,14 +253,23 @@ uint16x8_t test_vaddq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vaddq_x_n_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vaddq_x_n_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vaddq_x_n_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
index 9d36a2d334860..708502ddb52fc 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -1,6 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
@@ -48,10 +50,15 @@ uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
-// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR2:[0-9]+]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b)
 {
@@ -62,12 +69,19 @@ float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vmulq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
 {
@@ -78,12 +92,19 @@ int8x16_t test_vmulq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmulq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
@@ -94,12 +115,19 @@ uint16x8_t test_vmulq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_s32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_s32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x i32> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_s32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmulq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
 {
@@ -110,12 +138,19 @@ int32x4_t test_vmulq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pre
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vmulq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
 {
@@ -126,12 +161,19 @@ float16x8_t test_vmulq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_u8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_u8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_u8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vmulq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 {
@@ -142,12 +184,19 @@ uint8x16_t test_vmulq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_s16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_s16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_s16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmulq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 {
@@ -158,12 +207,19 @@ int16x8_t test_vmulq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_u32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_u32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NONSTRICT-NEXT:    ret <4 x i32> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_u32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmulq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
 {
@@ -174,12 +230,19 @@ uint32x4_t test_vmulq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vmulq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
 {
@@ -238,12 +301,19 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_n_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_n_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_n_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b)
 {
@@ -254,14 +324,23 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_n_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_n_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_n_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vmulq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
 {
@@ -272,14 +351,23 @@ int8x16_t test_vmulq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred1
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_n_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_n_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_n_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmulq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p)
 {
@@ -290,14 +378,23 @@ uint16x8_t test_vmulq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_n_s32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_n_s32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x i32> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_n_s32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmulq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p)
 {
@@ -308,14 +405,23 @@ int32x4_t test_vmulq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pre
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_m_n_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_m_n_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_m_n_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p)
 {
@@ -326,14 +432,23 @@ float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_n_u8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef)
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_n_u8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_n_u8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vmulq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p)
 {
@@ -344,14 +459,23 @@ uint8x16_t test_vmulq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_n_s16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_n_s16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_n_s16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmulq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p)
 {
@@ -361,14 +485,23 @@ int16x8_t test_vmulq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p)
     return vmulq_x_n_s16(a, b, p);
 #endif /* POLYMORPHIC */
 }
-// CHECK-LABEL: @test_vmulq_x_n_u32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef)
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_n_u32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NONSTRICT-NEXT:    ret <4 x i32> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_n_u32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmulq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
 {
@@ -379,14 +512,23 @@ uint32x4_t test_vmulq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vmulq_x_n_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef)
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vmulq_x_n_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vmulq_x_n_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vmulq_x_n_f32(float32x4_t a, float32_t b, mve_pred16_t p)
 {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
index 3d87770889707..6f7b5a4f8271f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
@@ -1,6 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -frounding-math -fexperimental-strict-floating-point -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s --check-prefixes=CHECK,CHECK-STRICT
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
@@ -20,10 +22,15 @@ uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[B:%.*]]
-// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]]) #[[ATTR2:[0-9]+]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b)
 {
@@ -34,12 +41,19 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_m_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_m_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_m_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
 {
@@ -50,12 +64,19 @@ int8x16_t test_vsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_m_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_m_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_m_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vsubq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
 {
@@ -66,12 +87,19 @@ float32x4_t test_vsubq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_x_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_x_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_x_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vsubq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
@@ -82,12 +110,19 @@ uint16x8_t test_vsubq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_x_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_x_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_x_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vsubq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 {
@@ -114,12 +149,19 @@ uint32x4_t test_vsubq_n_u32(uint32x4_t a, uint32_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_n_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_n_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP0]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_n_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b)
 {
@@ -130,14 +172,23 @@ float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_m_n_s8(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_m_n_s8(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_m_n_s8(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p)
 {
@@ -148,14 +199,23 @@ int8x16_t test_vsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred1
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_m_n_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
-// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_m_n_f32(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NONSTRICT-NEXT:    ret <4 x float> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_m_n_f32(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vsubq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p)
 {
@@ -166,14 +226,23 @@ float32x4_t test_vsubq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b,
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_x_n_u16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_x_n_u16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_x_n_u16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vsubq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
 {
@@ -184,14 +253,23 @@ uint16x8_t test_vsubq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p)
 #endif /* POLYMORPHIC */
 }
 
-// CHECK-LABEL: @test_vsubq_x_n_f16(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
-// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
-// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+// CHECK-NONSTRICT-LABEL: @test_vsubq_x_n_f16(
+// CHECK-NONSTRICT-NEXT:  entry:
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-NONSTRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NONSTRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NONSTRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NONSTRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NONSTRICT-NEXT:    ret <8 x half> [[TMP2]]
+//
+// CHECK-STRICT-LABEL: @test_vsubq_x_n_f16(
+// CHECK-STRICT-NEXT:  entry:
+// CHECK-STRICT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
+// CHECK-STRICT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-STRICT-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-STRICT-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef) #[[ATTR2]]
+// CHECK-STRICT-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 {
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index f55a5f54bd158..d2b366779ba16 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -757,6 +757,26 @@ class IRIntrinsicResult : public Result {
   }
 };
 
+// Result subclass that generates 
+// Builder.getIsFPConstrained() ? <Standard> : <StrictFp>
+class StrictFpAltResult : public Result {
+public:
+  Ptr Standard;
+  Ptr StrictFp;
+  StrictFpAltResult(Ptr Standard,Ptr StrictFp)
+      : Standard(Standard), StrictFp(StrictFp) {}
+  void genCode(raw_ostream &OS,
+               CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "!Builder.getIsFPConstrained() ? ";
+    Standard->genCode(OS, ParamAlloc);
+    OS << " : ";
+    StrictFp->genCode(OS, ParamAlloc);
+  }
+  void morePrerequisites(std::vector<Ptr> &output) const override {
+    Standard->morePrerequisites(output);
+  }
+};
+
 // Result subclass that specifies a type, for use in IRBuilder operations such
 // as CreateBitCast that take a type argument.
 class TypeResult : public Result {
@@ -1239,7 +1259,8 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
       Args.push_back(getCodeForDagArg(D, i, Scope, Param));
-    if (Op->isSubClassOf("IRBuilderBase")) {
+
+    auto GenIRBuilderBase = [&](const Record *Op) {
       std::set<unsigned> AddressArgs;
       std::map<unsigned, std::string> IntegerArgs;
       for (const Record *sp : Op->getValueAsListOfDefs("special_params")) {
@@ -1252,7 +1273,8 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
       }
       return std::make_shared<IRBuilderResult>(Op->getValueAsString("prefix"),
                                                Args, AddressArgs, IntegerArgs);
-    } else if (Op->isSubClassOf("IRIntBase")) {
+    };
+    auto GenIRIntBase = [&](const Record *Op) {
       std::vector<const Type *> ParamTypes;
       for (const Record *RParam : Op->getValueAsListOfDefs("params"))
         ParamTypes.push_back(getType(RParam, Param));
@@ -1260,7 +1282,18 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
       if (Op->getValueAsBit("appendKind"))
         IntName += "_" + toLetter(cast<ScalarType>(Param)->kind());
       return std::make_shared<IRIntrinsicResult>(IntName, ParamTypes, Args);
-    } else {
+    };
+
+    if (Op->isSubClassOf("IRBuilderBase")) {
+      return GenIRBuilderBase(Op);
+    } else if (Op->isSubClassOf("IRIntBase")) {
+      return GenIRIntBase(Op);
+    } else if (Op->isSubClassOf("strictFPAlt")) {
+      auto Standard = GenIRBuilderBase(Op->getValueAsDef("standard"));
+      auto StrictFp = GenIRIntBase(Op->getValueAsDef("strictfp"));
+      return std::make_shared<StrictFpAltResult>(Standard, StrictFp);
+    }
+    else  {
       PrintFatalError("Unsupported dag node " + Op->getName());
     }
   }
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 1219ce2f86da8..ecadb235bec36 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1138,6 +1138,18 @@ def int_arm_mve_vshlc_predicated: DefaultAttrsIntrinsic<
    [llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
    [LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
     llvm_i32_ty /* shift count */, llvm_anyvector_ty], [IntrNoMem]>;
+def int_arm_mve_vadd: DefaultAttrsIntrinsic<
+   [llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>],
+   [IntrNoMem]>;
+def int_arm_mve_vsub: DefaultAttrsIntrinsic<
+   [llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>],
+   [IntrNoMem]>;
+def int_arm_mve_vmul: DefaultAttrsIntrinsic<
+   [llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>],
+   [IntrNoMem]>;
 def int_arm_mve_vmulh: DefaultAttrsIntrinsic<
    [llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 98591fa3f5bd7..6da04c4ac6f18 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -384,6 +384,16 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin
                             (VTI.Vec MQPR:$inactive)))>;
 }
 
+def vadd : PatFrags<(ops node:$lhs, node:$rhs),
+                    [(fadd node:$lhs, node:$rhs),
+                     (int_arm_mve_vadd node:$lhs, node:$rhs)]>;
+def vsub : PatFrags<(ops node:$lhs, node:$rhs),
+                    [(fsub node:$lhs, node:$rhs),
+                     (int_arm_mve_vsub node:$lhs, node:$rhs)]>;
+def vmul : PatFrags<(ops node:$lhs, node:$rhs),
+                    [(fmul node:$lhs, node:$rhs),
+                     (int_arm_mve_vmul node:$lhs, node:$rhs)]>;
+
 // --------- Start of base classes for the instructions themselves
 
 class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@@ -3605,7 +3615,7 @@ class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[
   let validForTailPredication = 1;
 }
 
-multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op,
+multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDPatternOperator Op,
                           Intrinsic PredInt, SDPatternOperator IdentityVec> {
   def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>;
   defvar Inst = !cast<Instruction>(NAME);
@@ -3616,7 +3626,7 @@ multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op,
 }
 
 multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
-  : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated, IdentityVec>;
+  : MVE_VMULT_fp_m<"vmul", VTI, vmul, int_arm_mve_mul_predicated, IdentityVec>;
 
 def ARMimmOneF: PatLeaf<(bitconvert (v4f32 (ARMvmovFPImm (i32 112))))>; // 1.0 float
 def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 half
@@ -3740,7 +3750,7 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
 defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
 
 multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
-                            SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
+                            SDPatternOperator Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
   def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> {
     let validForTailPredication = 1;
   }
@@ -3752,9 +3762,9 @@ multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
 }
 
 multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
-  : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated, IdentityVec>;
+  : MVE_VADDSUB_fp_m<"vadd", 0, VTI, vadd, int_arm_mve_add_predicated, IdentityVec>;
 multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
-  : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated, IdentityVec>;
+  : MVE_VADDSUB_fp_m<"vsub", 1, VTI, vsub, int_arm_mve_sub_predicated, IdentityVec>;
 
 def ARMimmMinusZeroF: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 1664))))>; // -0.0 float
 def ARMimmMinusZeroH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2688))))>; // -0.0 half
@@ -5391,21 +5401,22 @@ defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16, subnuw, ARMvshruImm>;
 defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32, subnuw, ARMvshruImm>;
 
 multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract,
-                            SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
+                            SDPatternOperator Op, Intrinsic PredInt,
+                            SDPatternOperator IdentityVec> {
   def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>;
   defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ),
                               !cast<Instruction>(NAME), IdentityVec>;
 }
 
 let Predicates = [HasMVEFloat] in {
-  defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd,
+  defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, vadd,
                                           int_arm_mve_add_predicated, ARMimmMinusZeroF>;
-  defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd,
+  defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, vadd,
                                           int_arm_mve_add_predicated, ARMimmMinusZeroH>;
 
-  defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub,
+  defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, vsub,
                                           int_arm_mve_sub_predicated, ARMimmAllZerosV>;
-  defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub,
+  defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, vsub,
                                           int_arm_mve_sub_predicated, ARMimmAllZerosV>;
 }
 
@@ -5588,7 +5599,7 @@ defm MVE_VQRDMULH_qr_s32  : MVE_VQRDMULH_qr_m<MVE_v4s32>;
 multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> {
   let validForTailPredication = 1 in
   def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>;
-  defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ),
+  defm : MVE_TwoOpPatternDup<VTI, vmul, int_arm_mve_mul_predicated, (? ),
                              !cast<Instruction>(NAME), IdentityVec>;
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll
new file mode 100644
index 0000000000000..fa1ae5cf306d8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: test_vaddq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vaddq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vadd.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: test_vsubq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vsubq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vsub.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: test_vmulq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.vmul.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vmulq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %0
+}
+
+
+
+
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16_splat(<8 x half> %a, half %b) {
+; CHECK-LABEL: test_vaddq_f16_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f16 r0, s4
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <8 x half> undef, half %b, i32 0
+  %s = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+  %0 = tail call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> %a, <8 x half> %s)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32_splat(<4 x float> %a, float %b) {
+; CHECK-LABEL: test_vaddq_f32_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %b, i32 0
+  %s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.arm.mve.vadd.v4f32(<4 x float> %a, <4 x float> %s)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16_splat(<8 x half> %a, half %b) {
+; CHECK-LABEL: test_vsubq_f16_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f16 r0, s4
+; CHECK-NEXT:    vsub.f16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <8 x half> undef, half %b, i32 0
+  %s = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+  %0 = tail call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> %a, <8 x half> %s)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32_splat(<4 x float> %a, float %b) {
+; CHECK-LABEL: test_vsubq_f32_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vsub.f32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %b, i32 0
+  %s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.arm.mve.vsub.v4f32(<4 x float> %a, <4 x float> %s)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16_splat(<8 x half> %a, half %b) {
+; CHECK-LABEL: test_vmulq_f16_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f16 r0, s4
+; CHECK-NEXT:    vmul.f16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <8 x half> undef, half %b, i32 0
+  %s = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+  %0 = tail call <8 x half> @llvm.arm.mve.vmul.v8f16(<8 x half> %a, <8 x half> %s)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32_splat(<4 x float> %a, float %b) {
+; CHECK-LABEL: test_vmulq_f32_splat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmul.f32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %b, i32 0
+  %s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> %a, <4 x float> %s)
+  ret <4 x float> %0
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
index 961489613b626..389ccd3491539 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
@@ -25,8 +25,8 @@ entry:
 
 declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
-define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
-; CHECK-LABEL: test_vabdq_f32:
+define arm_aapcs_vfpcc <8 x half> @test_vabdq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test_vabdq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vabd.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index 0bec2b100911c..1aea7f9849b89 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -63,8 +63,8 @@ entry:
   ret <8 x i16> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @sext_v8i1_v8f32(<8 x half> %src1, <8 x half> %src2) {
-; CHECK-MVE-LABEL: sext_v8i1_v8f32:
+define arm_aapcs_vfpcc <8 x i16> @sext_v8i1_v8f16(<8 x half> %src1, <8 x half> %src2) {
+; CHECK-MVE-LABEL: sext_v8i1_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    .save {r4, lr}
 ; CHECK-MVE-NEXT:    push {r4, lr}
@@ -110,7 +110,7 @@ define arm_aapcs_vfpcc <8 x i16> @sext_v8i1_v8f32(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vmov.16 q0[7], r0
 ; CHECK-MVE-NEXT:    pop {r4, pc}
 ;
-; CHECK-MVEFP-LABEL: sext_v8i1_v8f32:
+; CHECK-MVEFP-LABEL: sext_v8i1_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vmov.i16 q2, #0x0
 ; CHECK-MVEFP-NEXT:    vmov.i8 q3, #0xff
@@ -291,8 +291,8 @@ entry:
   ret <8 x i16> %0
 }
 
-define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8f32(<8 x half> %src1, <8 x half> %src2) {
-; CHECK-MVE-LABEL: zext_v8i1_v8f32:
+define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8f16(<8 x half> %src1, <8 x half> %src2) {
+; CHECK-MVE-LABEL: zext_v8i1_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    .save {r4, lr}
 ; CHECK-MVE-NEXT:    push {r4, lr}
@@ -340,7 +340,7 @@ define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8f32(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vand q0, q1, q0
 ; CHECK-MVE-NEXT:    pop {r4, pc}
 ;
-; CHECK-MVEFP-LABEL: zext_v8i1_v8f32:
+; CHECK-MVEFP-LABEL: zext_v8i1_v8f16:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry
 ; CHECK-MVEFP-NEXT:    vmov.i16 q2, #0x0
 ; CHECK-MVEFP-NEXT:    vmov.i16 q3, #0x1