[llvm] a12f588 - [ARM, MVE] Add intrinsics for contiguous load/stores.

Simon Tatham via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 13 04:48:07 PST 2019


Author: Simon Tatham
Date: 2019-11-13T12:47:00Z
New Revision: a12f588ebb1a56bf7028d963fc1bdbd3229f5f5c

URL: https://github.com/llvm/llvm-project/commit/a12f588ebb1a56bf7028d963fc1bdbd3229f5f5c
DIFF: https://github.com/llvm/llvm-project/commit/a12f588ebb1a56bf7028d963fc1bdbd3229f5f5c.diff

LOG: [ARM,MVE] Add intrinsics for contiguous load/stores.

This patch adds the ACLE intrinsics for all the MVE load and store
instructions not already handled by D69791. These don't need new IR
intrinsics, because they can be implemented in terms of standard
LLVM IR constructs.
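
To illustrate the simple full-width case (a minimal sketch based on
the autogenerated checks in the new Clang test below; the function
name is only illustrative), vld1q_s32 is just an ordinary vector load
at the element alignment:

    #include <arm_mve.h>

    int32x4_t load_example(const int32_t *base)
    {
        // Expected IR, per the test checks:
        //   %0 = bitcast i32* %base to <4 x i32>*
        //   %1 = load <4 x i32>, <4 x i32>* %0, align 4
        return vld1q_s32(base);
    }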

Some of the load and store instructions access less than 128 bits of
memory, sign/zero-extending each value into a wider vector lane on
load, or truncating it on store. These are represented in IR by a load
of a shorter vector followed by a zext/sext, and conversely by a trunc
followed by a store of the shorter vector. Existing ISel patterns
already recognize those combinations and turn them into the right MVE
instructions.
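
For example (again sketched from the checks in the new test, with
illustrative function names), the widening load vldrbq_s16 and the
narrowing store vstrbq_s16 become:

    #include <arm_mve.h>

    int16x8_t widening_load_example(const int8_t *base)
    {
        // %1 = load <8 x i8>, <8 x i8>* %0, align 1
        // %2 = sext <8 x i8> %1 to <8 x i16>
        return vldrbq_s16(base);
    }

    void narrowing_store_example(int8_t *base, int16x8_t value)
    {
        // %0 = trunc <8 x i16> %value to <8 x i8>
        // store <8 x i8> %0, <8 x i8>* %1, align 1
        vstrbq_s16(base, value);
    }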

The predicated forms of all these instructions are represented in the
same way, except that the ordinary load/store operation is replaced
with the existing intrinsics @llvm.masked.{load,store}. These are
currently code-generated as predicated MVE load/store instructions
only if you give LLVM the `-enable-arm-maskedldst` option, so I've
done that in the LLVM codegen test. Once that becomes the default,
the option can be removed.
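
For instance (another sketch following the new test's checks, with an
illustrative function name), the predicated vld1q_z_s32 converts the
16-bit predicate into a <4 x i1> mask and then calls the masked-load
intrinsic:

    #include <arm_mve.h>

    int32x4_t predicated_load_example(const int32_t *base, mve_pred16_t p)
    {
        // %1 = zext i16 %p to i32
        // %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
        // %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
        //          <4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
        return vld1q_z_s32(base, p);
    }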

In the Tablegen backend, I've had to add a handful of extra support
features:

* We need to be able to make clang::Address objects out of a
  pointer and an alignment (previously we only needed these when the
  user passed us an existing one).

* We can now specify vector types that aren't 128 bits wide (for use
  in those intermediate values in IR); the parametrized type system
  can make one starting from two existing vector types, using the lane
  count of one and the element type of the other.

* I've added support for code generation of pointer casts, and for
  specifying LLVM types as operands to IRBuilder operations (needed
  here for zext and sext, though I expect they'll come in useful
  elsewhere too).

* Now not all IR construction operations need to be specified as
  Builder.CreateFoo; some don't involve a Builder at all, and one
  passes it as a parameter to a tiny static helper function in
  CGBuiltin.cpp.

Reviewers: ostannard, MarkMurrayARM, dmgreen

Subscribers: kristof.beyls, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D70088

Added: 
    clang/test/CodeGen/arm-mve-intrinsics/load-store.c
    llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/include/clang/Basic/arm_mve_defs.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/utils/TableGen/MveEmitter.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 6e0e8ce4e5ef..d2f877dda28e 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -72,6 +72,124 @@ def vcvt#half#q_m_f16: Intrinsic<
 
 } // loop over half = "b", "t"
 
+multiclass contiguous_load<string mnemonic, PrimitiveType memtype,
+                           list<Type> same_size, list<Type> wider> {
+  // Intrinsics named with explicit memory and element sizes that match:
+  // vldrbq_?8, vldrhq_?16, vldrwq_?32.
+  let params = same_size, pnt = PNT_None in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (load (address (CPtr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (IRIntBase<"masked_load", [Vector, CPtr<Vector>]>
+                        (CPtr<Vector> $addr), !srl(memtype.size,3),
+                        $pred, (zeroinit Vector))>,
+         NameOverride<mnemonic # "_z">;
+  }
+
+  // Synonyms for the above, with the generic name vld1q that just means
+  // 'memory and element sizes match', and allows convenient polymorphism with
+  // the memory and element types covariant.
+  let params = same_size in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (load (address (CPtr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<"vld1q">;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (IRIntBase<"masked_load", [Vector, CPtr<Vector>]>
+                        (CPtr<Vector> $addr), !srl(memtype.size,3),
+                        $pred, (zeroinit Vector))>,
+         NameOverride<"vld1q_z">;
+  }
+
+  // Intrinsics with the memory size narrower than the vector element, so that
+  // they load less than 128 bits of memory and sign/zero extend each loaded
+  // value into a wider vector lane.
+  let params = wider, pnt = PNT_None in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (extend (load (address (CPtr<NarrowedVecOf<memtype,Vector>>
+                                           $addr), !srl(memtype.size,3))),
+                           Vector, (unsignedflag Scalar))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (extend (IRIntBase<"masked_load",
+                                      [NarrowedVecOf<memtype,Vector>,
+                                      CPtr<NarrowedVecOf<memtype,Vector>>]>
+                                (CPtr<NarrowedVecOf<memtype,Vector>> $addr),
+                                !srl(memtype.size,3), $pred,
+                                (zeroinit NarrowedVecOf<memtype,Vector>)),
+                           Vector, (unsignedflag Scalar))>,
+         NameOverride<mnemonic # "_z">;
+  }
+}
+
+defm: contiguous_load<"vldrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>;
+defm: contiguous_load<"vldrhq", u16, T.All16, T.Int32>;
+defm: contiguous_load<"vldrwq", u32, T.All32, []>;
+
+multiclass contiguous_store<string mnemonic, PrimitiveType memtype,
+                           list<Type> same_size, list<Type> wider> {
+  // Intrinsics named with explicit memory and element sizes that match:
+  // vstrbq_?8, vstrhq_?16, vstrwq_?32.
+  let params = same_size in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store $value,
+                          (address (Ptr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store", [Vector, Ptr<Vector>]>
+                        $value, (Ptr<Vector> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<mnemonic # "_p">;
+  }
+
+  // Synonyms for the above, with the generic name vst1q that just means
+  // 'memory and element sizes match', and allows convenient polymorphism with
+  // the memory and element types covariant.
+  let params = same_size in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store $value,
+                          (address (Ptr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<"vst1q">;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store", [Vector, Ptr<Vector>]>
+                        $value, (Ptr<Vector> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<"vst1q_p">;
+  }
+
+  // Intrinsics with the memory size narrower than the vector element, so that
+  // they store less than 128 bits of memory, truncating each vector lane into
+  // a narrower value to store.
+  let params = wider in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store (trunc $value, NarrowedVecOf<memtype,Vector>),
+                          (address (Ptr<NarrowedVecOf<memtype,Vector>> $addr),
+                                   !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store",
+                              [NarrowedVecOf<memtype,Vector>,
+                               Ptr<NarrowedVecOf<memtype,Vector>>]>
+                        (trunc $value, NarrowedVecOf<memtype,Vector>),
+                        (Ptr<NarrowedVecOf<memtype,Vector>> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<mnemonic # "_p">;
+  }
+}
+
+defm: contiguous_store<"vstrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>;
+defm: contiguous_store<"vstrhq", u16, T.All16, T.Int32>;
+defm: contiguous_store<"vstrwq", u32, T.All32, []>;
+
 multiclass gather_base<list<Type> types, int size> {
   let params = types, pnt = PNT_None in {
     def _gather_base: Intrinsic<

diff  --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 3d9333f3d445..da6928fc137a 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -28,13 +28,31 @@ def args;
 
 // -----------------------------------------------------------------------------
 // Family of nodes for use in the codegen dag for an intrinsic, corresponding
-// roughly to operations in LLVM IR. More precisely, they correspond to calls
-// to methods of llvm::IRBuilder.
-class IRBuilder<string func_> {
-  string func = func_;          // the method name
+// to function calls that return LLVM IR nodes.
+class IRBuilderBase {
+  // The prefix of the function call, including an open parenthesis.
+  string prefix;
+
+  // Any parameters that have types that have to be treated specially by the
+  // Tablegen back end. Generally these will be types other than llvm::Value *,
+  // although not all other types need special treatment (e.g. llvm::Type *).
   list<int> address_params = []; // indices of parameters with type Address
   list<int> int_constant_params = []; // indices of plain integer parameters
 }
+class IRBuilder<string func> : IRBuilderBase {
+  // The usual case: a method called on the code gen function's instance of
+  // llvm::IRBuilder.
+  let prefix = "Builder." # func # "(";
+}
+class IRFunction<string func> : IRBuilderBase {
+  // Some other function that doesn't use the IRBuilder at all.
+  let prefix = func # "(";
+}
+class CGHelperFn<string func> : IRBuilderBase {
+  // A helper function defined in CGBuiltin.cpp, which takes the IRBuilder as
+  // an argument.
+  let prefix = func # "(Builder, ";
+}
 def add: IRBuilder<"CreateAdd">;
 def or: IRBuilder<"CreateOr">;
 def and: IRBuilder<"CreateAnd">;
@@ -46,12 +64,19 @@ def fsub: IRBuilder<"CreateFSub">;
 def load: IRBuilder<"CreateLoad"> { let address_params = [0]; }
 def store: IRBuilder<"CreateStore"> { let address_params = [1]; }
 def xval: IRBuilder<"CreateExtractValue"> { let int_constant_params = [1]; }
+def trunc: IRBuilder<"CreateTrunc">;
+def extend: CGHelperFn<"SignOrZeroExtend"> { let int_constant_params = [2]; }
+def zeroinit: IRFunction<"llvm::Constant::getNullValue">;
+
+// A node that makes an Address out of a pointer-typed Value, by
+// providing an alignment as the second argument.
+def address;
 
 // Another node class you can use in the codegen dag. This one corresponds to
 // an IR intrinsic function, which has to be specialized to a particular list
 // of types.
-class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
-  string intname = name_;       // base name of the intrinsic, minus "arm_mve_"
+class IRIntBase<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
+  string intname = name_;       // base name of the intrinsic
   list<Type> params = params_;  // list of parameter types
 
   // If this flag is set, then the IR intrinsic name will get a suffix _s, _u
@@ -67,6 +92,11 @@ class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
   bit appendKind = appendKind_;
 }
 
+// Mostly we'll be using @llvm.arm.mve.* intrinsics, so here's a trivial
+// subclass that puts on that prefix.
+class IRInt<string name, list<Type> params = [], bit appendKind = 0>
+      : IRIntBase<"arm_mve_" # name, params, appendKind>;
+
 // The 'seq' node in a codegen dag specifies a set of IR operations to be
 // performed in order. It has the special ability to define extra variable
 // names, on top of the ones that refer to the intrinsic's parameters. For
@@ -151,6 +181,14 @@ def sint: PrimitiveType<"s", 32> { let nameOverride = "int"; }
 // is.
 class VecOf<Type t>: ComplexType<(CTO_Vec t)>;
 
+// NarrowedVecOf<t,v> expects t to be a scalar type, and v to be a vector
+// type. It returns a vector type whose element type is t, and whose lane
+// count is the same as the lane count of v. (Used as an intermediate value
+// type in the IR representation of a widening load: you load a vector of
+// small things out of memory, and then zext/sext them into a full 128-bit
+// output vector.)
+class NarrowedVecOf<Type t, Type v>: ComplexType<(CTO_Vec t, v)>;
+
 // PredOf expects t to be a scalar, and expands to a predicate vector which
 // (logically speaking) has the same number of lanes as VecOf<t> would.
 class PredOf<Type t>: ComplexType<(CTO_Pred t)>;

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0a16636c98f8..516add2e130e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6788,6 +6788,13 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
   }
 }
 
+static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
+                                     llvm::Type *T, bool Unsigned) {
+  // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
+  // which finds it convenient to specify signed/unsigned as a boolean flag.
+  return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
+}
+
 Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E,
                                               ReturnValueSlot ReturnValue,

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
new file mode 100644
index 000000000000..5cbf6a3128c0
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
@@ -0,0 +1,1325 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vld1q_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
+float16x8_t test_vld1q_f16(const float16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_f16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x float> [[TMP1]]
+//
+float32x4_t test_vld1q_f32(const float32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_f32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+int8x16_t test_vld1q_s8(const int8_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s8(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vld1q_s16(const int16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+int32x4_t test_vld1q_s32(const int32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vld1q_u8(const uint8_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u8(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vld1q_u16(const uint16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vld1q_u32(const uint32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_f16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_f32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s8(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u8(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+int8x16_t test_vldrbq_s8(const int8_t *base)
+{
+    return vldrbq_s8(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vldrbq_s16(const int8_t *base)
+{
+    return vldrbq_s16(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vldrbq_s32(const int8_t *base)
+{
+    return vldrbq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vldrbq_u8(const uint8_t *base)
+{
+    return vldrbq_u8(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vldrbq_u16(const uint8_t *base)
+{
+    return vldrbq_u16(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vldrbq_u32(const uint8_t *base)
+{
+    return vldrbq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s8(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+//
+int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u8(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+//
+uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
+float16x8_t test_vldrhq_f16(const float16_t *base)
+{
+    return vldrhq_f16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vldrhq_s16(const int16_t *base)
+{
+    return vldrhq_s16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vldrhq_s32(const int16_t *base)
+{
+    return vldrhq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vldrhq_u16(const uint16_t *base)
+{
+    return vldrhq_u16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vldrhq_u32(const uint16_t *base)
+{
+    return vldrhq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_f16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_s16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_u16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x float> [[TMP1]]
+//
+float32x4_t test_vldrwq_f32(const float32_t *base)
+{
+    return vldrwq_f32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+int32x4_t test_vldrwq_s32(const int32_t *base)
+{
+    return vldrwq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vldrwq_u32(const uint32_t *base)
+{
+    return vldrwq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_f32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+uint32x4_t test_vldrwq_z_u32(const uint32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vst1q_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f16(float16_t *base, float16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_f16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f32(float32_t *base, float32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_f32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s8(int8_t *base, int8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s16(int16_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s32(int32_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u8(uint8_t *base, uint8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u16(uint16_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u32(uint32_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_f16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_f32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s8(int8_t *base, int8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s16(int8_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s32(int8_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u8(uint8_t *base, uint8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u16(uint8_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u32(uint8_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u32(uint8_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_f16(float16_t *base, float16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_f16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_s16(int16_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_s32(int16_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_u16(uint16_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_u32(uint16_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_f16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_u32(uint16_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_f32(float32_t *base, float32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_f32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_s32(int32_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_u32(uint32_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_f32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
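
For reference, a minimal usage sketch of the intrinsics exercised above
(hand-written, assuming an MVE-enabled toolchain and <arm_mve.h>; not part of
the patch, and the helper name is invented for illustration). A widening byte
load pairs with a predicated narrowing byte store:

    #include <arm_mve.h>

    /* vldrbq_s16 loads 8 bytes and sign-extends each to a 16-bit lane;
       vstrbq_p_s16 truncates the lanes back to bytes and stores only the
       lanes enabled by the predicate p. */
    void copy8_predicated(const int8_t *src, int8_t *dst, mve_pred16_t p)
    {
        int16x8_t v = vldrbq_s16(src);
        vstrbq_p_s16(dst, v, p);
    }

The store can equally be written through its polymorphic name vstrbq_p, as the
POLYMORPHIC branches of the test above demonstrate.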

diff  --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index aa3b475ea7b7..2941cb6ec088 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -283,12 +283,9 @@ class VectorType : public CRegularNamedType {
   unsigned Lanes;
 
 public:
-  VectorType(const ScalarType *Element)
-      : CRegularNamedType(TypeKind::Vector), Element(Element) {
-    // MVE has a fixed 128-bit vector size
-    Lanes = 128 / Element->sizeInBits();
-  }
-  unsigned sizeInBits() const override { return 128; }
+  VectorType(const ScalarType *Element, unsigned Lanes)
+      : CRegularNamedType(TypeKind::Vector), Element(Element), Lanes(Lanes) {}
+  unsigned sizeInBits() const override { return Lanes * Element->sizeInBits(); }
   unsigned lanes() const { return Lanes; }
   bool requiresFloat() const override { return Element->requiresFloat(); }
   std::string cNameBase() const override {
@@ -609,24 +606,41 @@ class IntCastResult : public Result {
   }
 };
 
+// Result subclass representing a cast between different pointer types.
+class PointerCastResult : public Result {
+public:
+  const PointerType *PtrType;
+  Ptr V;
+  PointerCastResult(const PointerType *PtrType, Ptr V)
+      : PtrType(PtrType), V(V) {}
+  void genCode(raw_ostream &OS,
+               CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Builder.CreatePointerCast(" << V->asValue() << ", "
+       << ParamAlloc.allocParam("llvm::Type *", PtrType->llvmName()) << ")";
+  }
+  void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.push_back(V);
+  }
+};
+
 // Result subclass representing a call to an IRBuilder method. Each IRBuilder
 // method we want to use will have a Tablegen record giving the method name and
 // describing any important details of how to call it, such as whether a
 // particular argument should be an integer constant instead of an llvm::Value.
 class IRBuilderResult : public Result {
 public:
-  StringRef BuilderMethod;
+  StringRef CallPrefix;
   std::vector<Ptr> Args;
   std::set<unsigned> AddressArgs;
   std::set<unsigned> IntConstantArgs;
-  IRBuilderResult(StringRef BuilderMethod, std::vector<Ptr> Args,
+  IRBuilderResult(StringRef CallPrefix, std::vector<Ptr> Args,
                   std::set<unsigned> AddressArgs,
                   std::set<unsigned> IntConstantArgs)
-      : BuilderMethod(BuilderMethod), Args(Args), AddressArgs(AddressArgs),
+    : CallPrefix(CallPrefix), Args(Args), AddressArgs(AddressArgs),
         IntConstantArgs(IntConstantArgs) {}
   void genCode(raw_ostream &OS,
                CodeGenParamAllocator &ParamAlloc) const override {
-    OS << "Builder." << BuilderMethod << "(";
+    OS << CallPrefix;
     const char *Sep = "";
     for (unsigned i = 0, e = Args.size(); i < e; ++i) {
       Ptr Arg = Args[i];
@@ -652,6 +666,25 @@ class IRBuilderResult : public Result {
   }
 };
 
+// Result subclass representing making an Address out of a Value.
+class AddressResult : public Result {
+public:
+  Ptr Arg;
+  unsigned Align;
+  AddressResult(Ptr Arg, unsigned Align) : Arg(Arg), Align(Align) {}
+  void genCode(raw_ostream &OS,
+               CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Address(" << Arg->varname() << ", CharUnits::fromQuantity("
+       << Align << "))";
+  }
+  std::string typeName() const override {
+    return "Address";
+  }
+  void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.push_back(Arg);
+  }
+};
+
 // Result subclass representing a call to an IR intrinsic, which we first have
 // to look up using an Intrinsic::ID constant and an array of types.
 class IRIntrinsicResult : public Result {
@@ -665,7 +698,7 @@ class IRIntrinsicResult : public Result {
   void genCode(raw_ostream &OS,
                CodeGenParamAllocator &ParamAlloc) const override {
     std::string IntNo = ParamAlloc.allocParam(
-        "Intrinsic::ID", "Intrinsic::arm_mve_" + IntrinsicID);
+        "Intrinsic::ID", "Intrinsic::" + IntrinsicID);
     OS << "Builder.CreateCall(CGM.getIntrinsic(" << IntNo;
     if (!ParamTypes.empty()) {
       OS << ", llvm::SmallVector<llvm::Type *, " << ParamTypes.size() << "> {";
@@ -689,6 +722,20 @@ class IRIntrinsicResult : public Result {
   }
 };
 
+// Result subclass that specifies a type, for use in IRBuilder operations such
+// as CreateBitCast that take a type argument.
+class TypeResult : public Result {
+public:
+  const Type *T;
+  TypeResult(const Type *T) : T(T) {}
+  void genCode(raw_ostream &OS, CodeGenParamAllocator &) const override {
+    OS << T->llvmName();
+  }
+  std::string typeName() const override {
+    return "llvm::Type *";
+  }
+};
+
 // -----------------------------------------------------------------------------
 // Class that describes a single ACLE intrinsic.
 //
@@ -852,7 +899,8 @@ class MveEmitter {
   // MveEmitter holds a collection of all the types we've instantiated.
   VoidType Void;
   std::map<std::string, std::unique_ptr<ScalarType>> ScalarTypes;
-  std::map<std::pair<ScalarTypeKind, unsigned>, std::unique_ptr<VectorType>>
+  std::map<std::tuple<ScalarTypeKind, unsigned, unsigned>,
+           std::unique_ptr<VectorType>>
       VectorTypes;
   std::map<std::pair<std::string, unsigned>, std::unique_ptr<MultiVectorType>>
       MultiVectorTypes;
@@ -872,12 +920,16 @@ class MveEmitter {
   const ScalarType *getScalarType(Record *R) {
     return getScalarType(R->getName());
   }
-  const VectorType *getVectorType(const ScalarType *ST) {
-    std::pair<ScalarTypeKind, unsigned> key(ST->kind(), ST->sizeInBits());
+  const VectorType *getVectorType(const ScalarType *ST, unsigned Lanes) {
+    std::tuple<ScalarTypeKind, unsigned, unsigned> key(ST->kind(),
+                                                       ST->sizeInBits(), Lanes);
     if (VectorTypes.find(key) == VectorTypes.end())
-      VectorTypes[key] = std::make_unique<VectorType>(ST);
+      VectorTypes[key] = std::make_unique<VectorType>(ST, Lanes);
     return VectorTypes[key].get();
   }
+  const VectorType *getVectorType(const ScalarType *ST) {
+    return getVectorType(ST, 128 / ST->sizeInBits());
+  }
   const MultiVectorType *getMultiVectorType(unsigned Registers,
                                             const VectorType *VT) {
     std::pair<std::string, unsigned> key(VT->cNameBase(), Registers);
@@ -969,7 +1021,13 @@ const Type *MveEmitter::getType(DagInit *D, const Type *Param) {
 
   if (Op->getName() == "CTO_Vec") {
     const Type *Element = getType(D->getArg(0), Param);
-    return getVectorType(cast<ScalarType>(Element));
+    if (D->getNumArgs() == 1) {
+      return getVectorType(cast<ScalarType>(Element));
+    } else {
+      const Type *ExistingVector = getType(D->getArg(1), Param);
+      return getVectorType(cast<ScalarType>(Element),
+                           cast<VectorType>(ExistingVector)->lanes());
+    }
   }
 
   if (Op->getName() == "CTO_Pred") {
@@ -1035,8 +1093,21 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
         else
           return std::make_shared<IntCastResult>(ST, Arg);
       }
+    } else if (const auto *PT = dyn_cast<PointerType>(CastType)) {
+      return std::make_shared<PointerCastResult>(PT, Arg);
     }
     PrintFatalError("Unsupported type cast");
+  } else if (Op->getName() == "address") {
+    if (D->getNumArgs() != 2)
+      PrintFatalError("'address' should have two arguments");
+    Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
+    unsigned Alignment;
+    if (auto *II = dyn_cast<IntInit>(D->getArg(1))) {
+      Alignment = II->getValue();
+    } else {
+      PrintFatalError("'address' alignment argument should be an integer");
+    }
+    return std::make_shared<AddressResult>(Arg, Alignment);
   } else if (Op->getName() == "unsignedflag") {
     if (D->getNumArgs() != 1)
       PrintFatalError("unsignedflag should have exactly one argument");
@@ -1053,7 +1124,7 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
       Args.push_back(getCodeForDagArg(D, i, Scope, Param));
-    if (Op->isSubClassOf("IRBuilder")) {
+    if (Op->isSubClassOf("IRBuilderBase")) {
       std::set<unsigned> AddressArgs;
       for (unsigned i : Op->getValueAsListOfInts("address_params"))
         AddressArgs.insert(i);
@@ -1061,8 +1132,8 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
       for (unsigned i : Op->getValueAsListOfInts("int_constant_params"))
         IntConstantArgs.insert(i);
       return std::make_shared<IRBuilderResult>(
-          Op->getValueAsString("func"), Args, AddressArgs, IntConstantArgs);
-    } else if (Op->isSubClassOf("IRInt")) {
+          Op->getValueAsString("prefix"), Args, AddressArgs, IntConstantArgs);
+    } else if (Op->isSubClassOf("IRIntBase")) {
       std::vector<const Type *> ParamTypes;
       for (Record *RParam : Op->getValueAsListOfDefs("params"))
         ParamTypes.push_back(getType(RParam, Param));
@@ -1099,6 +1170,14 @@ Result::Ptr MveEmitter::getCodeForDagArg(DagInit *D, unsigned ArgNum,
   if (auto *DI = dyn_cast<DagInit>(Arg))
     return getCodeForDag(DI, Scope, Param);
 
+  if (auto *DI = dyn_cast<DefInit>(Arg)) {
+    Record *Rec = DI->getDef();
+    if (Rec->isSubClassOf("Type")) {
+      const Type *T = getType(Rec, Param);
+      return std::make_shared<TypeResult>(T);
+    }
+  }
+
   PrintFatalError("bad dag argument type for code generation");
 }
 
@@ -1111,8 +1190,9 @@ Result::Ptr MveEmitter::getCodeForArg(unsigned ArgNum, const Type *ArgType) {
       V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
   } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
     V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-    V = std::make_shared<IRIntrinsicResult>(
-        "pred_i2v", std::vector<const Type *>{PT}, std::vector<Result::Ptr>{V});
+    V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
+                                            std::vector<const Type *>{PT},
+                                            std::vector<Result::Ptr>{V});
   }
 
   return V;

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll
new file mode 100644
index 000000000000..9780b0864cc7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll
@@ -0,0 +1,1208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -enable-arm-maskedldst -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vld1q_f16(half* %base) {
+; CHECK-LABEL: test_vld1q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  ret <8 x half> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vld1q_f32(float* %base) {
+; CHECK-LABEL: test_vld1q_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_s8(i8* %base) {
+; CHECK-LABEL: test_vld1q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_s16(i16* %base) {
+; CHECK-LABEL: test_vld1q_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_s32(i32* %base) {
+; CHECK-LABEL: test_vld1q_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_u8(i8* %base) {
+; CHECK-LABEL: test_vld1q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_u16(i16* %base) {
+; CHECK-LABEL: test_vld1q_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_u32(i32* %base) {
+; CHECK-LABEL: test_vld1q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vld1q_z_f16(half* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer)
+  ret <8 x half> %3
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
+
+define arm_aapcs_vfpcc <4 x float> @test_vld1q_z_f32(float* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer)
+  ret <4 x float> %3
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_s8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_s16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_s32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_u8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_u16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_u32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_s8(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_s16(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_s32(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_u8(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_u16(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_u32(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_s8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_s16(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.s16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer)
+  %4 = sext <8 x i8> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_s32(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer)
+  %4 = sext <4 x i8> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_u8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_u16(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer)
+  %4 = zext <8 x i8> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_u32(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer)
+  %4 = zext <4 x i8> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vldrhq_f16(half* %base) {
+; CHECK-LABEL: test_vldrhq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  ret <8 x half> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_s16(i16* %base) {
+; CHECK-LABEL: test_vldrhq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_s32(i16* %base) {
+; CHECK-LABEL: test_vldrhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_u16(i16* %base) {
+; CHECK-LABEL: test_vldrhq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_u32(i16* %base) {
+; CHECK-LABEL: test_vldrhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vldrhq_z_f16(half* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer)
+  ret <8 x half> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_s16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_s32(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer)
+  %4 = sext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_u16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_u32(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer)
+  %4 = zext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vldrwq_f32(float* %base) {
+; CHECK-LABEL: test_vldrwq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_s32(i32* %base) {
+; CHECK-LABEL: test_vldrwq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_u32(i32* %base) {
+; CHECK-LABEL: test_vldrwq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vldrwq_z_f32(float* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer)
+  ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_s32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_u32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_f16(half* %base, <8 x half> %value) {
+; CHECK-LABEL: test_vst1q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  store <8 x half> %value, <8 x half>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_f32(float* %base, <4 x float> %value) {
+; CHECK-LABEL: test_vst1q_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  store <4 x float> %value, <4 x float>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vst1q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vst1q_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vst1q_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vst1q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vst1q_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vst1q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s16(i8* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  store <8 x i8> %0, <8 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s32(i8* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  store <4 x i8> %0, <4 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u16(i8* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  store <8 x i8> %0, <8 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u32(i8* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  store <4 x i8> %0, <4 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s16(i8* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s32(i8* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u16(i8* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u32(i8* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_f16(half* %base, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  store <8 x half> %value, <8 x half>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_s16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_s32(i16* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  store <4 x i16> %0, <4 x i16>* %1, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_u16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_u32(i16* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  store <4 x i16> %0, <4 x i16>* %1, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_s32(i16* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_u32(i16* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_f32(float* %base, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  store <4 x float> %value, <4 x float>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_s32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_u32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
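
The _z load forms above zero their inactive lanes (note the zeroinitializer
passthrough operand of @llvm.masked.load), so the result can be consumed
directly without further masking. A hand-written usage sketch, assuming
<arm_mve.h> and the vctp32q predicate-creation intrinsic from the MVE ACLE
(the helper name below is illustrative, not part of this patch):

    #include <arm_mve.h>

    /* Widening predicated load: the first min(n, 4) 16-bit elements are
       zero-extended into 32-bit lanes; the remaining lanes read as zero. */
    uint32x4_t load_first_n(const uint16_t *src, unsigned n)
    {
        mve_pred16_t p = vctp32q(n);
        return vldrhq_z_u32(src, p);
    }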
