[llvm] c32af44 - [ARM,MVE] Add the vmovnbq,vmovntq intrinsic family.

Simon Tatham via llvm-commits <llvm-commits at lists.llvm.org>
Tue Feb 18 01:36:48 PST 2020


Author: Simon Tatham
Date: 2020-02-18T09:34:50Z
New Revision: c32af4447f79f5e7f246917fe1c3f58b2f6fc2a6

URL: https://github.com/llvm/llvm-project/commit/c32af4447f79f5e7f246917fe1c3f58b2f6fc2a6
DIFF: https://github.com/llvm/llvm-project/commit/c32af4447f79f5e7f246917fe1c3f58b2f6fc2a6.diff

LOG: [ARM,MVE] Add the vmovnbq,vmovntq intrinsic family.

Summary:
These are in some sense the inverse of vmovl[bt]q: they take a vector
of wide elements and truncate each to half its width, so they only
write half a vector's worth of output data. Therefore they also take
an 'inactive' parameter providing the other half of the data in the
output vector: vmovnb overwrites the even lanes of 'inactive' with the
narrowed values from the main input, and vmovnt overwrites the odd
lanes.
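
For illustration, here is a small usage sketch (hypothetical example
code, not part of this patch; beyond the new intrinsics it assumes
only vdupq_n_s8 from arm_mve.h) that packs two wide vectors into one
narrow one:

  #include <arm_mve.h>

  /* Pack two vectors of 16-bit elements into one vector of 8-bit
     elements: 'lo' supplies the even output lanes via vmovnbq, and
     'hi' then supplies the odd ones via vmovntq. */
  int8x16_t pack_s16(int16x8_t lo, int16x8_t hi)
  {
      int8x16_t v = vmovnbq_s16(vdupq_n_s8(0), lo); /* even lanes */
      return vmovntq_s16(v, hi);                    /* odd lanes */
  }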

LLVM already had codegen which generates these MVE instructions in
response to IR that takes two vectors of wide elements, or two vectors
of narrow ones. But in this case we have one vector of each, so my
clang codegen strategy is to narrow the input vector of wide elements
by simply reinterpreting it as the output type; then we have two
narrow vectors, and can represent the operation as a vector shuffle
that interleaves lanes from both of them.
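
As a scalar model of that shuffle-and-truncate sequence (an
illustrative sketch only, not code from the patch), the 16-bit case
behaves like:

  #include <stdint.h>

  /* Model of shufflevector(x, y, <0,8,1,9,...>) followed by trunc:
     even output lanes take the low byte of an element of x, odd
     lanes the low byte of an element of y. */
  static void zip_trunc_model(int8_t out[16], const int16_t x[8],
                              const int16_t y[8])
  {
      for (int i = 0; i < 8; i++) {
          out[2 * i]     = (int8_t)x[i];
          out[2 * i + 1] = (int8_t)y[i];
      }
  }

For vmovnt, x is the reinterpreted 'inactive' vector and y is the wide
input; for vmovnb the operands are swapped, with an extra lane
reversal of 'inactive' folded in so that the bytes it contributes end
up in the odd lanes.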

Even so, not all the cases I needed ended up being selected as a
single MVE instruction, so I've added a couple more patterns that spot
combinations of the 'MVEvmovn' and 'ARMvrev32'/'ARMvrev16' SDNodes
that can be generated as a VMOVN instruction with operands swapped.

This commit adds the unpredicated forms only.

Reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D74337

Added: 
    clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/include/clang/Basic/arm_mve_defs.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/utils/TableGen/MveEmitter.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 55ddfc22aa3d..3a6b63199e39 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -427,6 +427,14 @@ let params = [s8, u8, s16, u16] in {
     (extend (unzip $a, 1), DblVector, (unsignedflag Scalar))>;
 }
 
+let params = [s16, u16, s32, u32] in {
+  def vmovnbq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip $a, (vreinterpret (vrev $inactive, (bitsize Scalar)), Vector)),
+           HalfVector)>;
+  def vmovntq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip (vreinterpret $inactive, Vector), $a), HalfVector)>;
+}
+
 let params = T.Float in {
   def vrndq: Intrinsic<Vector, (args Vector:$a),
       (IRIntBase<"trunc", [Vector]> $a)>;

diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index f6816cdf45c9..7f8f717e8163 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -131,6 +131,7 @@ def vrev: CGHelperFn<"ARMMVEVectorElementReverse"> {
 def unzip: CGHelperFn<"VectorUnzip"> {
   let special_params = [IRBuilderIntParam<1, "bool">];
 }
+def zip: CGHelperFn<"VectorZip">;
 
 // Helper for making boolean flags in IR
 def i1: IRBuilderBase {
@@ -187,6 +188,10 @@ def seq;
 // and 0 for a signed (or floating) one.
 def unsignedflag;
 
+// 'bitsize' also takes a scalar type, and expands into an integer
+// constant giving its size in bits.
+def bitsize;
+
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b30a79a0bf10..401c4d8e0539 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7067,6 +7067,19 @@ static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd)
                                      Indices);
 }
 
+static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
+                              llvm::Value *V1) {
+  // Make a shufflevector that interleaves two vectors element by element.
+  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
+  SmallVector<uint32_t, 16> Indices;
+  unsigned InputElements = V0->getType()->getVectorNumElements();
+  for (unsigned i = 0; i < InputElements; i++) {
+    Indices.push_back(i);
+    Indices.push_back(i + InputElements);
+  }
+  return Builder.CreateShuffleVector(V0, V1, Indices);
+}
+
 template<unsigned HighBit, unsigned OtherBits>
 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
   // MVE-specific helper function to make a vector splat of a constant such as

diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
new file mode 100644
index 000000000000..5d157de0feb8
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
@@ -0,0 +1,199 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+
+#include <arm_mve.h>
+
+// LE-LABEL: @test_vmovnbq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vmovnbq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vmovnbq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vmovnbq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vmovnbq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmovntq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmovntq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmovntq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmovntq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u32(a, b);
+#endif /* POLYMORPHIC */
+}

diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index f6072336a701..6a34f1ff1b24 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1188,6 +1188,18 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
     } else {
       PrintFatalError("unsignedflag's argument should be a scalar type");
     }
+  } else if (Op->getName() == "bitsize") {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("bitsize should have exactly one argument");
+    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    if (!TypeRec->isSubClassOf("Type"))
+      PrintFatalError("bitsize's argument should be a type");
+    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
+      return std::make_shared<IntLiteralResult>(getScalarType("u32"),
+                                                ST->sizeInBits());
+    } else {
+      PrintFatalError("bitsize's argument should be a scalar type");
+    }
   } else {
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 538a120f9caa..769212f6aff2 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4322,8 +4322,16 @@ let Predicates = [HasMVEInt] in {
             (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
   def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
             (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qm),
+                             (v8i16 (ARMvrev32 MQPR:$Qd_src)), (i32 1))),
+            (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qm),
+                             (v16i8 (ARMvrev16 MQPR:$Qd_src)), (i32 1))),
+            (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
 }
 
+
 class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
                   list<dag> pattern=[]>
   : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
new file mode 100644
index 000000000000..f16e83a45b6f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=LE %s
+; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=BE %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
