[clang] 25305a9 - [ARM][MVE] Add intrinsics for more immediate shifts.

Simon Tatham via cfe-commits <cfe-commits at lists.llvm.org>
Fri Dec 13 05:07:53 PST 2019


Author: Simon Tatham
Date: 2019-12-13T13:07:39Z
New Revision: 25305a9311d45bc602014b7ee7584e80675aaf59

URL: https://github.com/llvm/llvm-project/commit/25305a9311d45bc602014b7ee7584e80675aaf59
DIFF: https://github.com/llvm/llvm-project/commit/25305a9311d45bc602014b7ee7584e80675aaf59.diff

LOG: [ARM][MVE] Add intrinsics for more immediate shifts.

Summary:
This fills in the remaining shift operations that take a single vector
input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and
`vshll[bt]` families.
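
For illustration, here is a minimal sketch of calling the new
intrinsics from C (the `demo` wrapper is hypothetical; it assumes
`<arm_mve.h>` and an MVE-enabled target such as
`-march=armv8.1-m.main+mve`, with arbitrary in-range shift counts):

    #include <arm_mve.h>

    int16x8_t demo(int8x16_t a)
    {
        int8x16_t  q = vqshlq_n_s8(a, 3);   /* saturating shift left by 3 */
        uint8x16_t u = vqshluq_n_s8(q, 5);  /* saturating shift left; signed
                                               input, unsigned result */
        int8x16_t  r = vrshrq_n_s8(q, 4);   /* rounding shift right by 4 */
        (void)u; (void)r;                   /* keep the sketch self-contained */
        return vshllbq_n_s8(q, 2);          /* widen bottom lanes to 16 bits,
                                               shifting left by 2 */
    }

The predicated `_m` and `_x` variants take an extra `mve_pred16_t`
predicate (and, for `_m`, an `inactive` fallback vector), exactly as in
the tests below.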

`vshll[bt]` (which shifts each input lane left into a double-width
output lane) is the most interesting one. There are separate MC
instruction IDs for shifting by exactly the input lane width and
for shifting by less than that, because the instruction encoding is
completely different in the lane-width special case. So I had to
write two sets of patterns to match based on the immediate shift
count, which involved adding a ComplexPattern matcher to avoid the
general-case pattern accidentally matching the special case too. For
that family I've made sure to add an llc codegen test for both
versions of each instruction.
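
Concretely, both cases print as the same assembly syntax but select
different encodings. A hedged sketch (the function names are
hypothetical, the instruction class names in the comments are the ones
defined in ARMInstrMVE.td below, and the asm is modulo register
allocation):

    #include <arm_mve.h>

    /* shift < lane width: matched by the general-case pattern,
       selecting an MVE_VSHLL_imm* encoding: vshll.s8 q0, q0, #7 */
    int16x8_t widen_by_7(int8x16_t a) { return vshllbq_n_s8(a, 7); }

    /* shift == lane width (8 for s8): matched by the special-case
       pattern, selecting an MVE_VSHLL_lw* encoding: vshll.s8 q0, q0, #8 */
    int16x8_t widen_by_8(int8x16_t a) { return vshllbq_n_s8(a, 8); }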

I'm experimenting with a new strategy for parametrising the isel
patterns for all these instructions: adding extra fields to the
relevant `Instruction` subclass itself, which are ignored by the
Tablegen backends that generate the MC data, but can be retrieved from
each instance of that instruction subclass when it's passed as a
template parameter to the multiclass that generates its isel patterns.
A nice effect is that I can fill in those informational fields
using `let` blocks, rather than having to type them out once per
instruction at `defm` time.

(As a result, quite a lot of existing instruction `def`s are
reindented by this patch, so it's clearer to read with whitespace
changes ignored.)

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71458

Added: 
    

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
    llvm/include/llvm/IR/IntrinsicsARM.td
    llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
    llvm/lib/Target/ARM/ARMInstrInfo.td
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 8bb567c573a3..9d9c067ade1c 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -606,6 +606,47 @@ let params = T.Int in {
   }
 }
 
+let params = T.Int in {
+  def vqshlq_n: Intrinsic<Vector, (args Vector:$v, imm_0toNm1:$sh),
+       (IRInt<"vqshl_imm", [Vector]> $v, $sh, (unsignedflag Scalar))>;
+  def vqshlq_m_n: Intrinsic<Vector, (args Vector:$inactive, Vector:$v,
+                                          imm_0toNm1:$sh, Predicate:$pred),
+       (IRInt<"vqshl_imm_predicated", [Vector, Predicate]>
+            $v, $sh, (unsignedflag Scalar), $pred, $inactive)>;
+
+  let pnt = PNT_NType in {
+    def vrshrq_n: Intrinsic<Vector, (args Vector:$v, imm_1toN:$sh),
+         (IRInt<"vrshr_imm", [Vector]> $v, $sh, (unsignedflag Scalar))>;
+    defm vrshrq: IntrinsicMX<Vector, (args Vector:$v, imm_1toN:$sh,
+                                           Predicate:$pred),
+         (IRInt<"vrshr_imm_predicated", [Vector, Predicate]>
+              $v, $sh, (unsignedflag Scalar), $pred, $inactive), "_n">;
+  }
+}
+
+let params = T.Signed, pnt = PNT_NType in {
+  def vqshluq_n: Intrinsic<UVector, (args Vector:$v, imm_0toNm1:$sh),
+       (IRInt<"vqshlu_imm", [Vector]> $v, $sh)>;
+  def vqshluq_m_n: Intrinsic<UVector, (args UVector:$inactive, Vector:$v,
+                                            imm_0toNm1:$sh, Predicate:$pred),
+       (IRInt<"vqshlu_imm_predicated", [Vector, Predicate]>
+            $v, $sh, $pred, $inactive)>;
+}
+
+multiclass vshll_imm<int top> {
+  let params = !listconcat(T.Int8, T.Int16), pnt = PNT_NType in {
+    def _n: Intrinsic<DblVector, (args Vector:$v, imm_1toN:$sh),
+        (IRInt<"vshll_imm", [DblVector, Vector]>
+            $v, $sh, (unsignedflag Scalar), top)>;
+    defm "": IntrinsicMX<DblVector, (args Vector:$v, imm_1toN:$sh,
+                                          Predicate:$pred),
+        (IRInt<"vshll_imm_predicated", [DblVector, Vector, Predicate]>
+            $v, $sh, (unsignedflag Scalar), top, $pred, $inactive), "_n">;
+  }
+}
+defm vshllbq : vshll_imm<0>;
+defm vshlltq : vshll_imm<1>;
+
 // Base class for the scalar shift intrinsics.
 class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
   Intrinsic<argtype, !con((args argtype:$value), shiftCountArg), shiftCodeGen> {

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
index 200273c03654..2128d0801c6a 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
@@ -720,3 +720,918 @@ uint32x4_t test_vshrq_x_n_u32(uint32x4_t a, mve_pred16_t p)
     return vshrq_x_n_u32(a, 6, p);
 #endif /* POLYMORPHIC */
 }
+
+// CHECK-LABEL: @test_vqshlq_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqshl.imm.v16i8(<16 x i8> [[A:%.*]], i32 3, i32 0)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+int8x16_t test_vqshlq_n_s8(int8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 3);
+#else /* POLYMORPHIC */
+    return vqshlq_n_s8(a, 3);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqshl.imm.v8i16(<8 x i16> [[A:%.*]], i32 4, i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vqshlq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 4);
+#else /* POLYMORPHIC */
+    return vqshlq_n_s16(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> [[A:%.*]], i32 4, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vqshlq_n_s32(int32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 4);
+#else /* POLYMORPHIC */
+    return vqshlq_n_s32(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqshl.imm.v16i8(<16 x i8> [[A:%.*]], i32 0, i32 1)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vqshlq_n_u8(uint8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 0);
+#else /* POLYMORPHIC */
+    return vqshlq_n_u8(a, 0);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqshl.imm.v8i16(<8 x i16> [[A:%.*]], i32 13, i32 1)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vqshlq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 13);
+#else /* POLYMORPHIC */
+    return vqshlq_n_u16(a, 13);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> [[A:%.*]], i32 6, i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vqshlq_n_u32(uint32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_n(a, 6);
+#else /* POLYMORPHIC */
+    return vqshlq_n_u32(a, 6);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqshlu.imm.v16i8(<16 x i8> [[A:%.*]], i32 5)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vqshluq_n_s8(int8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshluq(a, 5);
+#else /* POLYMORPHIC */
+    return vqshluq_n_s8(a, 5);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqshlu.imm.v8i16(<8 x i16> [[A:%.*]], i32 5)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vqshluq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshluq(a, 5);
+#else /* POLYMORPHIC */
+    return vqshluq_n_s16(a, 5);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqshlu.imm.v4i32(<4 x i32> [[A:%.*]], i32 4)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vqshluq_n_s32(int32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vqshluq(a, 4);
+#else /* POLYMORPHIC */
+    return vqshluq_n_s32(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.v16i8(<16 x i8> [[A:%.*]], i32 4, i32 0)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+int8x16_t test_vrshrq_n_s8(int8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 4);
+#else /* POLYMORPHIC */
+    return vrshrq_n_s8(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.v8i16(<8 x i16> [[A:%.*]], i32 12, i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vrshrq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 12);
+#else /* POLYMORPHIC */
+    return vrshrq_n_s16(a, 12);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.v4i32(<4 x i32> [[A:%.*]], i32 30, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vrshrq_n_s32(int32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 30);
+#else /* POLYMORPHIC */
+    return vrshrq_n_s32(a, 30);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.v16i8(<16 x i8> [[A:%.*]], i32 1, i32 1)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vrshrq_n_u8(uint8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 1);
+#else /* POLYMORPHIC */
+    return vrshrq_n_u8(a, 1);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.v8i16(<8 x i16> [[A:%.*]], i32 15, i32 1)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vrshrq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 15);
+#else /* POLYMORPHIC */
+    return vrshrq_n_u16(a, 15);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.v4i32(<4 x i32> [[A:%.*]], i32 20, i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vrshrq_n_u32(uint32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vrshrq(a, 20);
+#else /* POLYMORPHIC */
+    return vrshrq_n_u32(a, 20);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqshl.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 6, i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vqshlq_m_n_s8(int8x16_t inactive, int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 6, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_s8(inactive, a, 6, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqshl.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 13, i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vqshlq_m_n_s16(int16x8_t inactive, int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 13, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_s16(inactive, a, 13, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqshl.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 14, i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vqshlq_m_n_s32(int32x4_t inactive, int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 14, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_s32(inactive, a, 14, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqshl.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 4, i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vqshlq_m_n_u8(uint8x16_t inactive, uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 4, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_u8(inactive, a, 4, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqshl.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 9, i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vqshlq_m_n_u16(uint16x8_t inactive, uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 9, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_u16(inactive, a, 9, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshlq_m_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqshl.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 25, i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vqshlq_m_n_u32(uint32x4_t inactive, uint32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshlq_m_n(inactive, a, 25, p);
+#else /* POLYMORPHIC */
+    return vqshlq_m_n_u32(inactive, a, 25, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_m_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqshlu.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 2, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vqshluq_m_n_s8(uint8x16_t inactive, int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshluq_m(inactive, a, 2, p);
+#else /* POLYMORPHIC */
+    return vqshluq_m_n_s8(inactive, a, 2, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqshlu.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 12, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vqshluq_m_n_s16(uint16x8_t inactive, int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshluq_m(inactive, a, 12, p);
+#else /* POLYMORPHIC */
+    return vqshluq_m_n_s16(inactive, a, 12, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vqshluq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqshlu.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 24, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vqshluq_m_n_s32(uint32x4_t inactive, int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vqshluq_m(inactive, a, 24, p);
+#else /* POLYMORPHIC */
+    return vqshluq_m_n_s32(inactive, a, 24, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 2, i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vrshrq_m_n_s8(int8x16_t inactive, int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 2, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_s8(inactive, a, 2, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 11, i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vrshrq_m_n_s16(int16x8_t inactive, int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 11, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_s16(inactive, a, 11, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 24, i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vrshrq_m_n_s32(int32x4_t inactive, int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 24, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_s32(inactive, a, 24, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 7, i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vrshrq_m_n_u8(uint8x16_t inactive, uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 7, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_u8(inactive, a, 7, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 4, i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vrshrq_m_n_u16(uint16x8_t inactive, uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 4, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_u16(inactive, a, 4, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_m_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 27, i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vrshrq_m_n_u32(uint32x4_t inactive, uint32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_m(inactive, a, 27, p);
+#else /* POLYMORPHIC */
+    return vrshrq_m_n_u32(inactive, a, 27, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 3, i32 0, <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vrshrq_x_n_s8(int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 3, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_s8(a, 3, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 12, i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vrshrq_x_n_s16(int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 12, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_s16(a, 12, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 20, i32 0, <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vrshrq_x_n_s32(int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 20, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_s32(a, 20, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 1, i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vrshrq_x_n_u8(uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 1, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_u8(a, 1, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 13, i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vrshrq_x_n_u16(uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 13, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_u16(a, 13, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrshrq_x_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 6, i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vrshrq_x_n_u32(uint32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vrshrq_x(a, 6, p);
+#else /* POLYMORPHIC */
+    return vrshrq_x_n_u32(a, 6, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> [[A:%.*]], i32 2, i32 0, i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vshllbq_n_s8(int8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vshllbq(a, 2);
+#else /* POLYMORPHIC */
+    return vshllbq_n_s8(a, 2);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> [[A:%.*]], i32 13, i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vshllbq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vshllbq(a, 13);
+#else /* POLYMORPHIC */
+    return vshllbq_n_s16(a, 13);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> [[A:%.*]], i32 5, i32 1, i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vshllbq_n_u8(uint8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vshllbq(a, 5);
+#else /* POLYMORPHIC */
+    return vshllbq_n_u8(a, 5);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> [[A:%.*]], i32 6, i32 1, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vshllbq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vshllbq(a, 6);
+#else /* POLYMORPHIC */
+    return vshllbq_n_u16(a, 6);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> [[A:%.*]], i32 7, i32 0, i32 1)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vshlltq_n_s8(int8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vshlltq(a, 7);
+#else /* POLYMORPHIC */
+    return vshlltq_n_s8(a, 7);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> [[A:%.*]], i32 2, i32 0, i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vshlltq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vshlltq(a, 2);
+#else /* POLYMORPHIC */
+    return vshlltq_n_s16(a, 2);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> [[A:%.*]], i32 7, i32 1, i32 1)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vshlltq_n_u8(uint8x16_t a)
+{
+#ifdef POLYMORPHIC
+    return vshlltq(a, 7);
+#else /* POLYMORPHIC */
+    return vshlltq_n_u8(a, 7);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> [[A:%.*]], i32 14, i32 1, i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vshlltq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vshlltq(a, 14);
+#else /* POLYMORPHIC */
+    return vshlltq_n_u16(a, 14);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_m_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 6, i32 0, i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vshllbq_m_n_s8(int16x8_t inactive, int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_m(inactive, a, 6, p);
+#else /* POLYMORPHIC */
+    return vshllbq_m_n_s8(inactive, a, 6, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 10, i32 0, i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vshllbq_m_n_s16(int32x4_t inactive, int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_m(inactive, a, 10, p);
+#else /* POLYMORPHIC */
+    return vshllbq_m_n_s16(inactive, a, 10, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_m_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 3, i32 1, i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vshllbq_m_n_u8(uint16x8_t inactive, uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_m(inactive, a, 3, p);
+#else /* POLYMORPHIC */
+    return vshllbq_m_n_u8(inactive, a, 3, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 14, i32 1, i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vshllbq_m_n_u16(uint32x4_t inactive, uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_m(inactive, a, 14, p);
+#else /* POLYMORPHIC */
+    return vshllbq_m_n_u16(inactive, a, 14, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_m_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 4, i32 0, i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vshlltq_m_n_s8(int16x8_t inactive, int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_m(inactive, a, 4, p);
+#else /* POLYMORPHIC */
+    return vshlltq_m_n_s8(inactive, a, 4, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 12, i32 0, i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vshlltq_m_n_s16(int32x4_t inactive, int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_m(inactive, a, 12, p);
+#else /* POLYMORPHIC */
+    return vshlltq_m_n_s16(inactive, a, 12, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_m_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 2, i32 1, i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vshlltq_m_n_u8(uint16x8_t inactive, uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_m(inactive, a, 2, p);
+#else /* POLYMORPHIC */
+    return vshlltq_m_n_u8(inactive, a, 2, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 9, i32 1, i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vshlltq_m_n_u16(uint32x4_t inactive, uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_m(inactive, a, 9, p);
+#else /* POLYMORPHIC */
+    return vshlltq_m_n_u16(inactive, a, 9, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_x_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 1, i32 0, i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vshllbq_x_n_s8(int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_x(a, 1, p);
+#else /* POLYMORPHIC */
+    return vshllbq_x_n_s8(a, 1, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_x_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 10, i32 0, i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vshllbq_x_n_s16(int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_x(a, 10, p);
+#else /* POLYMORPHIC */
+    return vshllbq_x_n_s16(a, 10, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_x_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 6, i32 1, i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vshllbq_x_n_u8(uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_x(a, 6, p);
+#else /* POLYMORPHIC */
+    return vshllbq_x_n_u8(a, 6, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshllbq_x_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 10, i32 1, i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vshllbq_x_n_u16(uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshllbq_x(a, 10, p);
+#else /* POLYMORPHIC */
+    return vshllbq_x_n_u16(a, 10, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_x_n_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 2, i32 0, i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vshlltq_x_n_s8(int8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_x(a, 2, p);
+#else /* POLYMORPHIC */
+    return vshlltq_x_n_s8(a, 2, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_x_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 6, i32 0, i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vshlltq_x_n_s16(int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_x(a, 6, p);
+#else /* POLYMORPHIC */
+    return vshlltq_x_n_s16(a, 6, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_x_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 5, i32 1, i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vshlltq_x_n_u8(uint8x16_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_x(a, 5, p);
+#else /* POLYMORPHIC */
+    return vshlltq_x_n_u8(a, 5, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlltq_x_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 3, i32 1, i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vshlltq_x_n_u16(uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vshlltq_x(a, 3, p);
+#else /* POLYMORPHIC */
+    return vshlltq_x_n_u16(a, 3, p);
+#endif /* POLYMORPHIC */
+}
+

diff  --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 8995f969e2ea..a10620612e23 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -876,10 +876,19 @@ defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
    [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
 
 multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,
-                         LLVMType pred, list<IntrinsicProperty> props = []> {
+                         LLVMType pred = llvm_anyvector_ty,
+                         list<IntrinsicProperty> props = []> {
   def "": Intrinsic<rets, params, props>;
   def _predicated: Intrinsic<rets, params # [pred], props>;
 }
+multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
+                          LLVMType pred = llvm_anyvector_ty,
+                          list<IntrinsicProperty> props = [IntrNoMem]> {
+  def "": Intrinsic<rets, params, props>;
+  def _predicated: Intrinsic<rets, params # [pred,
+      !if(!eq(!cast<string>(rets[0]), "llvm_anyvector_ty"),
+          LLVMMatchType<0>, rets[0])], props>;
+}
 
 defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
    [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty, [IntrNoMem]>;
@@ -921,6 +930,16 @@ def int_arm_mve_shr_imm_predicated: Intrinsic<[llvm_anyvector_ty],
     llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 
+defm int_arm_mve_vqshl_imm: MVEPredicatedM<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty /*shiftcount*/, llvm_i32_ty /*unsigned*/]>;
+defm int_arm_mve_vrshr_imm: MVEPredicatedM<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty /*shiftcount*/, llvm_i32_ty /*unsigned*/]>;
+defm int_arm_mve_vqshlu_imm: MVEPredicatedM<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty /*shiftcount*/]>;
+defm int_arm_mve_vshll_imm: MVEPredicatedM<[llvm_anyvector_ty],
+   [llvm_anyvector_ty, llvm_i32_ty /*shiftcount*/, llvm_i32_ty /*unsigned*/,
+                       llvm_i32_ty /*top-half*/]>;
+
 // MVE scalar shifts.
 class ARM_MVE_qrshift_single<list<LLVMType> value,
                              list<LLVMType> saturate = []> :

diff  --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index cf3fd62e4934..6dd56b35d0ab 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -159,6 +159,9 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
                              SDValue &OffReg, SDValue &ShImm);
   bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
 
+  template<int Min, int Max>
+  bool SelectImmediateInRange(SDValue N, SDValue &OffImm);
+
   inline bool is_so_imm(unsigned Imm) const {
     return ARM_AM::getSOImmVal(Imm) != -1;
   }
@@ -1383,6 +1386,16 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
   return false;
 }
 
+template <int Min, int Max>
+bool ARMDAGToDAGISel::SelectImmediateInRange(SDValue N, SDValue &OffImm) {
+  int Val;
+  if (isScaledConstantInRange(N, 1, Min, Max, Val)) {
+    OffImm = CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+    return true;
+  }
+  return false;
+}
+
 bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
                                             SDValue &Base,
                                             SDValue &OffReg, SDValue &ShImm) {

diff  --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 155e0efff1a8..1cab1747ff4e 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -935,7 +935,10 @@ def MVEShiftImm1_7AsmOperand: ImmAsmOperand<1,7> {
   // encodings allow.
   let DiagnosticString = "operand must be an immediate in the range [1,8]";
 }
-def mve_shift_imm1_7 : Operand<i32> {
+def mve_shift_imm1_7 : Operand<i32>,
+    // SelectImmediateInRange / isScaledConstantInRange uses a
+    // half-open interval, so the parameters <1,8> mean 1-7 inclusive
+    ComplexPattern<i32, 1, "SelectImmediateInRange<1,8>", [], []> {
   let ParserMatchClass = MVEShiftImm1_7AsmOperand;
   let EncoderMethod = "getMVEShiftImmOpValue";
 }
@@ -948,7 +951,10 @@ def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> {
   // encodings allow.
   let DiagnosticString = "operand must be an immediate in the range [1,16]";
 }
-def mve_shift_imm1_15 : Operand<i32> {
+def mve_shift_imm1_15 : Operand<i32>,
+    // SelectImmediateInRange / isScaledConstantInRange uses a
+    // half-open interval, so the parameters <1,16> mean 1-15 inclusive
+    ComplexPattern<i32, 1, "SelectImmediateInRange<1,16>", [], []> {
   let ParserMatchClass = MVEShiftImm1_15AsmOperand;
   let EncoderMethod = "getMVEShiftImmOpValue";
 }

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index a1a8614e4a7b..21f0d5e86790 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2315,8 +2315,8 @@ let Predicates = [HasMVEInt] in {
 
 
 class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
-                    dag immops, list<dag> pattern=[]>
-  : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops),
+                    Operand immtype, list<dag> pattern=[]>
+  : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm),
                   iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> {
   let Inst{28} = U;
   let Inst{25-23} = 0b101;
@@ -2325,6 +2325,9 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
   let Inst{11-6} = 0b111101;
   let Inst{4} = 0b0;
   let Inst{0} = 0b0;
+
+  // For the MVE_VSHLL_patterns multiclass to refer to
+  Operand immediateType = immtype;
 }
 
 // The immediate VSHLL instructions accept shift counts from 1 up to
@@ -2333,7 +2336,7 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
 
 class MVE_VSHLL_imm8<string iname, string suffix,
                      bit U, bit th, list<dag> pattern=[]>
-  : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_7:$imm), pattern> {
+  : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, pattern> {
   bits<3> imm;
   let Inst{20-19} = 0b01;
   let Inst{18-16} = imm;
@@ -2341,7 +2344,7 @@ class MVE_VSHLL_imm8<string iname, string suffix,
 
 class MVE_VSHLL_imm16<string iname, string suffix,
                       bit U, bit th, list<dag> pattern=[]>
-  : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_15:$imm), pattern> {
+  : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, pattern> {
   bits<4> imm;
   let Inst{20} = 0b1;
   let Inst{19-16} = imm;
@@ -2385,6 +2388,45 @@ defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">;
 defm MVE_VSHLL_lwu8  : MVE_VSHLL_lw<"vshll", "u8",  0b00, 0b1, "$Qd, $Qm, #8">;
 defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">;
 
+multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
+  // A succession of local variable definitions, via singleton
+  // foreach, to make the actual patterns legible
+  foreach suffix = [!strconcat(VTI.Suffix, !if(top, "th", "bh"))] in
+  foreach inst_imm = [!cast<MVE_VSHLL_imm>("MVE_VSHLL_imm" # suffix)] in
+  foreach inst_lw = [!cast<MVE_VSHLL_by_lane_width>("MVE_VSHLL_lw" # suffix)] in
+  foreach unpred_int = [int_arm_mve_vshll_imm] in
+  foreach pred_int = [int_arm_mve_vshll_imm_predicated] in
+  foreach imm = [inst_imm.immediateType] in {
+
+    def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), imm:$imm,
+                                      (i32 VTI.Unsigned), (i32 top))),
+              (VTI.DblVec (inst_imm   (VTI.Vec MQPR:$src), imm:$imm))>;
+    def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
+                                      (i32 VTI.Unsigned), (i32 top))),
+              (VTI.DblVec (inst_lw    (VTI.Vec MQPR:$src)))>;
+
+    def : Pat<(VTI.DblVec (pred_int   (VTI.Vec MQPR:$src), imm:$imm,
+                                      (i32 VTI.Unsigned), (i32 top),
+                                      (VTI.Pred VCCR:$mask),
+                                      (VTI.DblVec MQPR:$inactive))),
+              (VTI.DblVec (inst_imm   (VTI.Vec MQPR:$src), imm:$imm,
+                                      ARMVCCThen, (VTI.Pred VCCR:$mask),
+                                      (VTI.DblVec MQPR:$inactive)))>;
+    def : Pat<(VTI.DblVec (pred_int   (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
+                                      (i32 VTI.Unsigned), (i32 top),
+                                      (VTI.Pred VCCR:$mask),
+                                      (VTI.DblVec MQPR:$inactive))),
+              (VTI.DblVec (inst_lw    (VTI.Vec MQPR:$src), ARMVCCThen,
+                                      (VTI.Pred VCCR:$mask),
+                                      (VTI.DblVec MQPR:$inactive)))>;
+
+  }
+}
+
+foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in
+  foreach top = [0, 1] in
+    defm : MVE_VSHLL_patterns<VTI, top>;
+
 class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
                dag immops, list<dag> pattern=[]>
   : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
@@ -2606,6 +2648,13 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
   let Inst{3-1} = Qm{2-0};
   let Inst{0} = 0b0;
   let validForTailPredication = 1;
+
+  // For the MVE_shift_imm_patterns multiclass to refer to
+  MVEVectorVTInfo VTI;
+  Operand immediateType;
+  Intrinsic unpred_int;
+  Intrinsic pred_int;
+  dag unsignedFlag = (?);
 }
 
 class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
@@ -2645,50 +2694,49 @@ def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,(ins imm0_31:$imm)> {
   let Inst{21} = 0b1;
 }
 
-class MVE_VQSHL_imm<string suffix, dag imm>
-  : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd),
-                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+class MVE_VQSHL_imm<MVEVectorVTInfo VTI_, Operand immType>
+  : MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd),
+                       (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
                        vpred_r, ""> {
   bits<6> imm;
 
+  let Inst{28} = VTI_.Unsigned;
   let Inst{25-24} = 0b11;
   let Inst{21-16} = imm;
   let Inst{10-8} = 0b111;
-}
-
-def MVE_VQSHLimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21-19} = 0b001;
-}
-
-def MVE_VQSHLimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21-19} = 0b001;
-}
-
-def MVE_VQSHLimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21-20} = 0b01;
-}
 
-def MVE_VQSHLimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21-20} = 0b01;
-}
-
-def MVE_VQSHLimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21} = 0b1;
+  let VTI = VTI_;
+  let immediateType = immType;
+  let unsignedFlag = (? (i32 VTI.Unsigned));
 }
 
-def MVE_VQSHLimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21} = 0b1;
+let unpred_int = int_arm_mve_vqshl_imm,
+    pred_int = int_arm_mve_vqshl_imm_predicated in {
+  def MVE_VQSHLimms8 : MVE_VQSHL_imm<MVE_v16s8, imm0_7> {
+    let Inst{21-19} = 0b001;
+  }
+  def MVE_VQSHLimmu8 : MVE_VQSHL_imm<MVE_v16u8, imm0_7> {
+    let Inst{21-19} = 0b001;
+  }
+  
+  def MVE_VQSHLimms16 : MVE_VQSHL_imm<MVE_v8s16, imm0_15> {
+    let Inst{21-20} = 0b01;
+  }
+  def MVE_VQSHLimmu16 : MVE_VQSHL_imm<MVE_v8u16, imm0_15> {
+    let Inst{21-20} = 0b01;
+  }
+  
+  def MVE_VQSHLimms32 : MVE_VQSHL_imm<MVE_v4s32, imm0_31> {
+    let Inst{21} = 0b1;
+  }
+  def MVE_VQSHLimmu32 : MVE_VQSHL_imm<MVE_v4u32, imm0_31> {
+    let Inst{21} = 0b1;
+  }
 }
 
-class MVE_VQSHLU_imm<string suffix, dag imm>
-  : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd),
-                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+class MVE_VQSHLU_imm<MVEVectorVTInfo VTI_, Operand immType>
+  : MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd),
+                       (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
                        vpred_r, ""> {
   bits<6> imm;
 
@@ -2696,61 +2744,103 @@ class MVE_VQSHLU_imm<string suffix, dag imm>
   let Inst{25-24} = 0b11;
   let Inst{21-16} = imm;
   let Inst{10-8} = 0b110;
-}
 
-def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> {
-  let Inst{21-19} = 0b001;
+  let VTI = VTI_;
+  let immediateType = immType;
 }
 
-def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> {
-  let Inst{21-20} = 0b01;
-}
+let unpred_int = int_arm_mve_vqshlu_imm,
+    pred_int = int_arm_mve_vqshlu_imm_predicated in {
+  def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<MVE_v16s8, imm0_7> {
+    let Inst{21-19} = 0b001;
+  }
 
-def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> {
-  let Inst{21} = 0b1;
+  def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<MVE_v8s16, imm0_15> {
+    let Inst{21-20} = 0b01;
+  }
+
+  def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<MVE_v4s32, imm0_31> {
+    let Inst{21} = 0b1;
+  }
 }
 
-class MVE_VRSHR_imm<string suffix, dag imm>
-  : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd),
-                       !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+class MVE_VRSHR_imm<MVEVectorVTInfo VTI_, Operand immType>
+  : MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd),
+                       (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
                        vpred_r, ""> {
   bits<6> imm;
 
+  let Inst{28} = VTI_.Unsigned;
   let Inst{25-24} = 0b11;
   let Inst{21-16} = imm;
   let Inst{10-8} = 0b010;
-}
 
-def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21-19} = 0b001;
+  let VTI = VTI_;
+  let immediateType = immType;
+  let unsignedFlag = (? (i32 VTI.Unsigned));
 }
 
-def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21-19} = 0b001;
-}
+let unpred_int = int_arm_mve_vrshr_imm,
+    pred_int = int_arm_mve_vrshr_imm_predicated in {
+  def MVE_VRSHR_imms8 : MVE_VRSHR_imm<MVE_v16s8, shr_imm8> {
+    let Inst{21-19} = 0b001;
+  }
 
-def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21-20} = 0b01;
-}
+  def MVE_VRSHR_immu8 : MVE_VRSHR_imm<MVE_v16u8, shr_imm8> {
+    let Inst{21-19} = 0b001;
+  }
 
-def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21-20} = 0b01;
-}
+  def MVE_VRSHR_imms16 : MVE_VRSHR_imm<MVE_v8s16, shr_imm16> {
+    let Inst{21-20} = 0b01;
+  }
 
-def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> {
-  let Inst{28} = 0b0;
-  let Inst{21} = 0b1;
-}
+  def MVE_VRSHR_immu16 : MVE_VRSHR_imm<MVE_v8u16, shr_imm16> {
+    let Inst{21-20} = 0b01;
+  }
 
-def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> {
-  let Inst{28} = 0b1;
-  let Inst{21} = 0b1;
+  def MVE_VRSHR_imms32 : MVE_VRSHR_imm<MVE_v4s32, shr_imm32> {
+    let Inst{21} = 0b1;
+  }
+
+  def MVE_VRSHR_immu32 : MVE_VRSHR_imm<MVE_v4u32, shr_imm32> {
+    let Inst{21} = 0b1;
+  }
 }
 
+multiclass MVE_shift_imm_patterns<MVE_shift_with_imm inst> {
+  def : Pat<(inst.VTI.Vec !con((inst.unpred_int (inst.VTI.Vec MQPR:$src),
+                                                inst.immediateType:$imm),
+                               inst.unsignedFlag)),
+            (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
+                                inst.immediateType:$imm))>;
+
+  def : Pat<(inst.VTI.Vec !con((inst.pred_int (inst.VTI.Vec MQPR:$src),
+                                              inst.immediateType:$imm),
+                               inst.unsignedFlag,
+                               (? (inst.VTI.Pred VCCR:$mask),
+                                  (inst.VTI.Vec MQPR:$inactive)))),
+            (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
+                                inst.immediateType:$imm,
+                                ARMVCCThen, (inst.VTI.Pred VCCR:$mask),
+                                (inst.VTI.Vec MQPR:$inactive)))>;
+}
+
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms32>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu32>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms32>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms8>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu8>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms16>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu16>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms32>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu32>;
+
 class MVE_VSHR_imm<string suffix, dag imm>
   : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
                        !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",

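For reference, the unpredicated forms of these intrinsics are reachable from C via <arm_mve.h>, as provided by the clang-side changes in this series. The following sketch is illustrative only and is not part of the patch; it assumes an MVE-enabled target such as -march=armv8.1-m.main+mve. The immediate ranges follow the architecture: vqshl takes 0 to lanebits-1, while vrshr and vshll[bt] take 1 to lanebits.

    #include <arm_mve.h>

    /* Saturating shift left: immediate in [0, lanebits-1]. */
    int8x16_t demo_vqshl(int8x16_t a) { return vqshlq_n_s8(a, 3); }

    /* Saturating shift left, signed input to unsigned result. */
    uint8x16_t demo_vqshlu(int8x16_t a) { return vqshluq_n_s8(a, 5); }

    /* Rounding shift right: immediate in [1, lanebits]. */
    uint16x8_t demo_vrshr(uint16x8_t a) { return vrshrq_n_u16(a, 15); }

    /* Widening shift left of the bottom (even-numbered) byte lanes. */
    int16x8_t demo_vshllb(int8x16_t a) { return vshllbq_n_s8(a, 2); }

    /* Top (odd-numbered) lanes, shifting by exactly the lane width;
       this is the case covered by the *_lanewidth tests below. */
    int16x8_t demo_vshllt_lw(int8x16_t a) { return vshlltq_n_s8(a, 8); }
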
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm.ll
index 86228ef94b38..dd4980f99b97 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm.ll
@@ -385,6 +385,1058 @@ entry:
   ret <4 x i32> %2
 }
 
+define arm_aapcs_vfpcc <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vqshlq_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.s8 q0, q0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vqshl.imm.v16i8(<16 x i8> %a, i32 3, i32 0)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vqshlq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.s16 q0, q0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vqshl.imm.v8i16(<8 x i16> %a, i32 4, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vqshlq_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.s32 q0, q0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %a, i32 4, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vqshlq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.u8 q0, q0, #0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vqshl.imm.v16i8(<16 x i8> %a, i32 0, i32 1)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vqshlq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.u16 q0, q0, #13
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vqshl.imm.v8i16(<8 x i16> %a, i32 13, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vqshlq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshl.u32 q0, q0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %a, i32 6, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vqshluq_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshlu.s8 q0, q0, #5
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vqshlu.imm.v16i8(<16 x i8> %a, i32 5)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vqshluq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshlu.s16 q0, q0, #5
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vqshlu.imm.v8i16(<8 x i16> %a, i32 5)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vqshluq_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqshlu.s32 q0, q0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vqshlu.imm.v4i32(<4 x i32> %a, i32 4)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vrshrq_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.s8 q0, q0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.v16i8(<16 x i8> %a, i32 4, i32 0)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vrshrq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.s16 q0, q0, #12
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.v8i16(<8 x i16> %a, i32 12, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vrshrq_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.s32 q0, q0, #30
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.v4i32(<4 x i32> %a, i32 30, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vrshrq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.v16i8(<16 x i8> %a, i32 1, i32 1)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vrshrq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.u16 q0, q0, #15
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.v8i16(<8 x i16> %a, i32 15, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vrshrq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrshr.u32 q0, q0, #20
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.v4i32(<4 x i32> %a, i32 20, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshlq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.s8 q0, q1, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vqshl.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 6, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshlq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.s16 q0, q1, #13
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vqshl.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 13, i32 0, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshlq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.s32 q0, q1, #14
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 14, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshlq_m_n_u8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.u8 q0, q1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vqshl.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 4, i32 1, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshlq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.u16 q0, q1, #9
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vqshl.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 9, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshlq_m_n_u32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshlq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlt.u32 q0, q1, #25
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 25, i32 1, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshluq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshluq_m_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlut.s8 q0, q1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vqshlu.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 2, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshluq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshluq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlut.s16 q0, q1, #12
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vqshlu.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 12, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqshluq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshluq_m_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vqshlut.s32 q0, q1, #24
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vqshlu.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 24, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s8 q0, q1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 2, i32 0, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s16 q0, q1, #11
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 11, i32 0, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s32 q0, q1, #24
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 24, i32 0, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_m_n_u8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u8 q0, q1, #7
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 7, i32 1, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u16 q0, q1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 4, i32 1, <8 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_m_n_u32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u32 q0, q1, #27
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 27, i32 1, <4 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_x_n_s8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s8 q0, q0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 3, i32 0, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_x_n_s16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s16 q0, q0, #12
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 12, i32 0, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_x_n_s32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.s32 q0, q0, #20
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 20, i32 0, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrq_x_n_u8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 1, i32 1, <16 x i1> %1, <16 x i8> undef)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrq_x_n_u16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u16 q0, q0, #13
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 13, i32 1, <8 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vrshrq_x_n_u32(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrq_x_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vrshrt.u32 q0, q0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6, i32 1, <4 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vshllbq_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.s8 q0, q0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 2, i32 0, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_n_s8_lanewidth(<16 x i8> %a) {
+; CHECK-LABEL: test_vshllbq_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.s8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 8, i32 0, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vshllbq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.s16 q0, q0, #13
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 13, i32 0, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_n_s16_lanewidth(<8 x i16> %a) {
+; CHECK-LABEL: test_vshllbq_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.s16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 16, i32 0, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_n_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vshllbq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.u8 q0, q0, #5
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 5, i32 1, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_n_u8_lanewidth(<16 x i8> %a) {
+; CHECK-LABEL: test_vshllbq_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.u8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 8, i32 1, i32 0)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_n_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vshllbq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.u16 q0, q0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 6, i32 1, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_n_u16_lanewidth(<8 x i16> %a) {
+; CHECK-LABEL: test_vshllbq_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllb.u16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 16, i32 1, i32 0)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vshlltq_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.s8 q0, q0, #7
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 7, i32 0, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_n_s8_lanewidth(<16 x i8> %a) {
+; CHECK-LABEL: test_vshlltq_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.s8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 8, i32 0, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vshlltq_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.s16 q0, q0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 2, i32 0, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_n_s16_lanewidth(<8 x i16> %a) {
+; CHECK-LABEL: test_vshlltq_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.s16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 16, i32 0, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_n_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vshlltq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.u8 q0, q0, #7
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 7, i32 1, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_n_u8_lanewidth(<16 x i8> %a) {
+; CHECK-LABEL: test_vshlltq_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.u8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8> %a, i32 8, i32 1, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_n_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vshlltq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.u16 q0, q0, #14
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 14, i32 1, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_n_u16_lanewidth(<8 x i16> %a) {
+; CHECK-LABEL: test_vshlltq_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vshllt.u16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16> %a, i32 16, i32 1, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_m_n_s8(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s8 q0, q1, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 6, i32 0, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_m_n_s8_lanewidth(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s8 q0, q1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 0, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_m_n_s16(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s16 q0, q1, #10
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 10, i32 0, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_m_n_s16_lanewidth(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s16 q0, q1, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 0, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_m_n_u8(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u8 q0, q1, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 3, i32 1, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_m_n_u8_lanewidth(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u8 q0, q1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 1, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_m_n_u16(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u16 q0, q1, #14
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 14, i32 1, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_m_n_u16_lanewidth(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_m_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u16 q0, q1, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 1, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_m_n_s8(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s8 q0, q1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 4, i32 0, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_m_n_s8_lanewidth(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s8 q0, q1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 0, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_m_n_s16(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s16 q0, q1, #12
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 12, i32 0, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_m_n_s16_lanewidth(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s16 q0, q1, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 0, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_m_n_u8(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u8 q0, q1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 2, i32 1, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_m_n_u8_lanewidth(<8 x i16> %inactive, <16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u8 q0, q1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 1, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_m_n_u16(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u16 q0, q1, #9
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 9, i32 1, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_m_n_u16_lanewidth(<4 x i32> %inactive, <8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_m_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u16 q0, q1, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 1, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_x_n_s8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s8 q0, q0, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 1, i32 0, i32 0, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_x_n_s8_lanewidth(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 0, i32 0, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_x_n_s16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s16 q0, q0, #10
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 10, i32 0, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_x_n_s16_lanewidth(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.s16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 0, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_x_n_u8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u8 q0, q0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 6, i32 1, i32 0, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshllbq_x_n_u8_lanewidth(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 1, i32 0, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_x_n_u16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u16 q0, q0, #10
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 10, i32 1, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshllbq_x_n_u16_lanewidth(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshllbq_x_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshllbt.u16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 1, i32 0, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_x_n_s8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s8 q0, q0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 2, i32 0, i32 1, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_x_n_s8_lanewidth(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_s8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 0, i32 1, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_x_n_s16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s16 q0, q0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 6, i32 0, i32 1, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_x_n_s16_lanewidth(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_s16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.s16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 0, i32 1, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_x_n_u8(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u8 q0, q0, #5
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 5, i32 1, i32 1, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlltq_x_n_u8_lanewidth(<16 x i8> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_u8_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u8 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, i32 8, i32 1, i32 1, <16 x i1> %1, <8 x i16> undef)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_x_n_u16(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u16 q0, q0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 3, i32 1, i32 1, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlltq_x_n_u16_lanewidth(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlltq_x_n_u16_lanewidth:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlltt.u16 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, i32 16, i32 1, i32 1, <8 x i1> %1, <4 x i32> undef)
+  ret <4 x i32> %2
+}
+
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
@@ -396,3 +1448,29 @@ declare <4 x i32> @llvm.arm.mve.shl.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4
 declare <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>, <16 x i8>)
 declare <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
 declare <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>, <4 x i32>)
+
+declare <16 x i8> @llvm.arm.mve.vqshl.imm.v16i8(<16 x i8>, i32, i32)
+declare <8 x i16> @llvm.arm.mve.vqshl.imm.v8i16(<8 x i16>, i32, i32)
+declare <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32>, i32, i32)
+declare <16 x i8> @llvm.arm.mve.vqshl.imm.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.arm.mve.vqshl.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
+declare <4 x i32> @llvm.arm.mve.vqshl.imm.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>, <4 x i32>)
+
+declare <16 x i8> @llvm.arm.mve.vqshlu.imm.v16i8(<16 x i8>, i32)
+declare <8 x i16> @llvm.arm.mve.vqshlu.imm.v8i16(<8 x i16>, i32)
+declare <4 x i32> @llvm.arm.mve.vqshlu.imm.v4i32(<4 x i32>, i32)
+declare <16 x i8> @llvm.arm.mve.vqshlu.imm.predicated.v16i8.v16i1(<16 x i8>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.arm.mve.vqshlu.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i32> @llvm.arm.mve.vqshlu.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>, <4 x i32>)
+
+declare <16 x i8> @llvm.arm.mve.vrshr.imm.v16i8(<16 x i8>, i32, i32)
+declare <8 x i16> @llvm.arm.mve.vrshr.imm.v8i16(<8 x i16>, i32, i32)
+declare <4 x i32> @llvm.arm.mve.vrshr.imm.v4i32(<4 x i32>, i32, i32)
+declare <16 x i8> @llvm.arm.mve.vrshr.imm.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.arm.mve.vrshr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
+declare <4 x i32> @llvm.arm.mve.vrshr.imm.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>, <4 x i32>)
+
+declare <8 x i16> @llvm.arm.mve.vshll.imm.v8i16.v16i8(<16 x i8>, i32, i32, i32)
+declare <4 x i32> @llvm.arm.mve.vshll.imm.v4i32.v8i16(<8 x i16>, i32, i32, i32)
+declare <8 x i16> @llvm.arm.mve.vshll.imm.predicated.v8i16.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>, <8 x i16>)
+declare <4 x i32> @llvm.arm.mve.vshll.imm.predicated.v4i32.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>, <4 x i32>)
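The predicated tests above correspond to the _m (merging) and _x (don't-care) C intrinsic variants. Another illustrative sketch under the same assumptions (<arm_mve.h>, MVE-enabled target); note that the _x forms lower to the same predicated IR intrinsics with an undef inactive operand, which is what the test_*_x_* functions check:

    #include <arm_mve.h>

    /* Merging: lanes whose predicate bit is 0 are taken from 'inactive'. */
    int8x16_t demo_vqshl_m(int8x16_t inactive, int8x16_t a, mve_pred16_t p)
    { return vqshlq_m_n_s8(inactive, a, 6, p); }

    /* Don't-care: lanes whose predicate bit is 0 are left undefined. */
    uint16x8_t demo_vrshr_x(uint16x8_t a, mve_pred16_t p)
    { return vrshrq_x_n_u16(a, 13, p); }

    /* Predicated widening shift with a lane-width immediate; the
       inactive vector has the double-width element type. */
    int16x8_t demo_vshllb_m(int16x8_t inactive, int8x16_t a, mve_pred16_t p)
    { return vshllbq_m_n_s8(inactive, a, 8, p); }
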


        

