[llvm] cd58fb6 - [ARM] Avoid pointless vrev of element-wise vmov

John Brawn via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 3 09:37:29 PDT 2020


Author: John Brawn
Date: 2020-04-03T17:36:50+01:00
New Revision: cd58fb632533e9bb87d401b734fcfec62012276d

URL: https://github.com/llvm/llvm-project/commit/cd58fb632533e9bb87d401b734fcfec62012276d
DIFF: https://github.com/llvm/llvm-project/commit/cd58fb632533e9bb87d401b734fcfec62012276d.diff

LOG: [ARM] Avoid pointless vrev of element-wise vmov

If we have an element-wise vmov immediate instruction followed by a vrev whose
width is greater than or equal to the vmov element width, then that vrev won't
do anything. Add a DAG combine to convert bitcasts that would become such vrevs
into vector_reg_casts instead.

Differential Revision: https://reviews.llvm.org/D76514
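
For illustration, the new v_movi8_sti16 test added to llvm/test/CodeGen/ARM/vmov.ll
in the diff below shows the effect. A bitcast of an element-wise vmov immediate,
such as

  define arm_aapcs_vfpcc void @v_movi8_sti16(i8* %p) {
    %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x i16>
    call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> %val, i32 2)
    ret void
  }

would previously have been lowered with a vrev after the vmov on big-endian
targets; since vmov.i8 writes the same value into every lane, that vrev is a
no-op, and with this combine both little- and big-endian targets now produce

  vmov.i8 d16, #0x1
  vst1.16 {d16}, [r0]
  mov pc, lr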

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/test/CodeGen/ARM/vmov.ll
    llvm/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/test/CodeGen/Thumb2/mve-vmovimm.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a0f553b7eb18..bfe475723cae 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -943,6 +943,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setTargetDAGCombine(ISD::INTRINSIC_VOID);
     setTargetDAGCombine(ISD::VECREDUCE_ADD);
     setTargetDAGCombine(ISD::ADD);
+    setTargetDAGCombine(ISD::BITCAST);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -9223,9 +9224,10 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
       N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
       N->getExtensionType(), N->isExpandingLoad());
   SDValue Combo = NewLoad;
-  if (!PassThru.isUndef() &&
-      (PassThru.getOpcode() != ISD::BITCAST ||
-       !isZeroVector(PassThru->getOperand(0))))
+  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
+                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
+                            isZeroVector(PassThru->getOperand(0));
+  if (!PassThru.isUndef() && !PassThruIsCastZero)
     Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
 }
@@ -15211,6 +15213,28 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   return Res;
 }
 
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+
+  // We may have a bitcast of something that has already had this bitcast
+  // combine performed on it, so skip past any VECTOR_REG_CASTs.
+  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
+    Src = Src.getOperand(0);
+
+  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
+  // would be generated is at least the width of the element type.
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = N->getValueType(0);
+  if ((Src.getOpcode() == ARMISD::VMOVIMM ||
+       Src.getOpcode() == ARMISD::VMVNIMM ||
+       Src.getOpcode() == ARMISD::VMOVFPIMM) &&
+      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
+      DAG.getDataLayout().isBigEndian())
+    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
+
+  return SDValue();
+}
+
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -15264,6 +15288,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformVLDCombine(N, DCI);
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
+  case ISD::BITCAST:
+    return PerformBITCASTCombine(N, DCI.DAG);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
   case ARMISD::VECTOR_REG_CAST:

diff --git a/llvm/test/CodeGen/ARM/vmov.ll b/llvm/test/CodeGen/ARM/vmov.ll
index 0341448f9a77..751fd2ff557a 100644
--- a/llvm/test/CodeGen/ARM/vmov.ll
+++ b/llvm/test/CodeGen/ARM/vmov.ll
@@ -1,242 +1,140 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck --check-prefixes=CHECK,CHECK-LE %s
-; RUN: llc -mtriple=armeb-eabi -mattr=+neon %s -o - | FileCheck --check-prefixes=CHECK,CHECK-BE %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck --check-prefixes=CHECK,CHECK-LE %s
+; RUN: llc -mtriple=armeb-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck --check-prefixes=CHECK,CHECK-BE %s
 
 define arm_aapcs_vfpcc <8 x i8> @v_movi8() nounwind {
-; CHECK-LE-LABEL: v_movi8:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i8 d0, #0x8
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi8:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i8 d16, #0x8
-; CHECK-BE-NEXT:    vrev64.8 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d0, #0x8
+; CHECK-NEXT:    mov pc, lr
 	ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
 }
 
 define arm_aapcs_vfpcc <4 x i16> @v_movi16a() nounwind {
-; CHECK-LE-LABEL: v_movi16a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i16 d0, #0x10
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi16a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i16 d16, #0x10
-; CHECK-BE-NEXT:    vrev64.16 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi16a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d0, #0x10
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
 }
 
 define arm_aapcs_vfpcc <4 x i16> @v_movi16b() nounwind {
-; CHECK-LE-LABEL: v_movi16b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i16 d0, #0x1000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi16b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i16 d16, #0x1000
-; CHECK-BE-NEXT:    vrev64.16 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi16b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d0, #0x1000
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
 }
 
 define arm_aapcs_vfpcc <4 x i16> @v_mvni16a() nounwind {
-; CHECK-LE-LABEL: v_mvni16a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i16 d0, #0x10
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni16a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i16 d16, #0x10
-; CHECK-BE-NEXT:    vrev64.16 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni16a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d0, #0x10
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
 }
 
 define arm_aapcs_vfpcc <4 x i16> @v_mvni16b() nounwind {
-; CHECK-LE-LABEL: v_mvni16b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i16 d0, #0x1000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni16b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i16 d16, #0x1000
-; CHECK-BE-NEXT:    vrev64.16 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni16b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d0, #0x1000
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32a() nounwind {
-; CHECK-LE-LABEL: v_movi32a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x20
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x20
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x20
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 32, i32 32 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32b() nounwind {
-; CHECK-LE-LABEL: v_movi32b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x2000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x2000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x2000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 8192, i32 8192 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32c() nounwind {
-; CHECK-LE-LABEL: v_movi32c:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x200000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32c:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x200000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32c:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x200000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 2097152, i32 2097152 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32d() nounwind {
-; CHECK-LE-LABEL: v_movi32d:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x20000000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32d:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x20000000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32d:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x20000000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 536870912, i32 536870912 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32e() nounwind {
-; CHECK-LE-LABEL: v_movi32e:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x20ff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32e:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x20ff
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32e:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x20ff
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 8447, i32 8447 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_movi32f() nounwind {
-; CHECK-LE-LABEL: v_movi32f:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 d0, #0x20ffff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movi32f:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 d16, #0x20ffff
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movi32f:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d0, #0x20ffff
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 2162687, i32 2162687 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32a() nounwind {
-; CHECK-LE-LABEL: v_mvni32a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x20
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x20
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x20
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 4294967263, i32 4294967263 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32b() nounwind {
-; CHECK-LE-LABEL: v_mvni32b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x2000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x2000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x2000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 4294959103, i32 4294959103 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32c() nounwind {
-; CHECK-LE-LABEL: v_mvni32c:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x200000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32c:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x200000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32c:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x200000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 4292870143, i32 4292870143 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32d() nounwind {
-; CHECK-LE-LABEL: v_mvni32d:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x20000000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32d:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x20000000
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32d:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x20000000
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 3758096383, i32 3758096383 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32e() nounwind {
-; CHECK-LE-LABEL: v_mvni32e:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x20ff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32e:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x20ff
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32e:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x20ff
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 4294958848, i32 4294958848 >
 }
 
 define arm_aapcs_vfpcc <2 x i32> @v_mvni32f() nounwind {
-; CHECK-LE-LABEL: v_mvni32f:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmvn.i32 d0, #0x20ffff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mvni32f:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmvn.i32 d16, #0x20ffff
-; CHECK-BE-NEXT:    vrev64.32 d0, d16
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mvni32f:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d0, #0x20ffff
+; CHECK-NEXT:    mov pc, lr
 	ret <2 x i32> < i32 4292804608, i32 4292804608 >
 }
 
@@ -249,128 +147,74 @@ define arm_aapcs_vfpcc <1 x i64> @v_movi64() nounwind {
 }
 
 define arm_aapcs_vfpcc <16 x i8> @v_movQi8() nounwind {
-; CHECK-LE-LABEL: v_movQi8:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i8 q0, #0x8
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi8:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i8 q8, #0x8
-; CHECK-BE-NEXT:    vrev64.8 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q0, #0x8
+; CHECK-NEXT:    mov pc, lr
 	ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
 }
 
 define arm_aapcs_vfpcc <8 x i16> @v_movQi16a() nounwind {
-; CHECK-LE-LABEL: v_movQi16a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i16 q0, #0x10
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi16a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i16 q8, #0x10
-; CHECK-BE-NEXT:    vrev64.16 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi16a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q0, #0x10
+; CHECK-NEXT:    mov pc, lr
 	ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
 }
 
 define arm_aapcs_vfpcc <8 x i16> @v_movQi16b() nounwind {
-; CHECK-LE-LABEL: v_movQi16b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i16 q0, #0x1000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi16b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i16 q8, #0x1000
-; CHECK-BE-NEXT:    vrev64.16 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi16b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q0, #0x1000
+; CHECK-NEXT:    mov pc, lr
 	ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32a() nounwind {
-; CHECK-LE-LABEL: v_movQi32a:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x20
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32a:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x20
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32a:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x20
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32b() nounwind {
-; CHECK-LE-LABEL: v_movQi32b:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x2000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32b:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x2000
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32b:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x2000
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32c() nounwind {
-; CHECK-LE-LABEL: v_movQi32c:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x200000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32c:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x200000
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32c:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x200000
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32d() nounwind {
-; CHECK-LE-LABEL: v_movQi32d:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x20000000
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32d:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x20000000
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32d:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x20000000
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32e() nounwind {
-; CHECK-LE-LABEL: v_movQi32e:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x20ff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32e:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x20ff
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32e:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x20ff
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
 }
 
 define arm_aapcs_vfpcc <4 x i32> @v_movQi32f() nounwind {
-; CHECK-LE-LABEL: v_movQi32f:
-; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x20ffff
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_movQi32f:
-; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i32 q8, #0x20ffff
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_movQi32f:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q0, #0x20ffff
+; CHECK-NEXT:    mov pc, lr
 	ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
 }
 
@@ -390,18 +234,11 @@ define arm_aapcs_vfpcc <2 x i64> @v_movQi64() nounwind {
 ; Check for correct assembler printing for immediate values.
 %struct.int8x8_t = type { <8 x i8> }
 define arm_aapcs_vfpcc void @vdupn128(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
-; CHECK-LE-LABEL: vdupn128:
-; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov.i8 d16, #0x80
-; CHECK-LE-NEXT:    vstr d16, [r0]
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: vdupn128:
-; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov.i8 d16, #0x80
-; CHECK-BE-NEXT:    vrev64.8 d16, d16
-; CHECK-BE-NEXT:    vstr d16, [r0]
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: vdupn128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 d16, #0x80
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = getelementptr inbounds %struct.int8x8_t, %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
   store <8 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>, <8 x i8>* %0, align 8
@@ -409,18 +246,11 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @vdupnneg75(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
-; CHECK-LE-LABEL: vdupnneg75:
-; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov.i8 d16, #0xb5
-; CHECK-LE-NEXT:    vstr d16, [r0]
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: vdupnneg75:
-; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov.i8 d16, #0xb5
-; CHECK-BE-NEXT:    vrev64.8 d16, d16
-; CHECK-BE-NEXT:    vstr d16, [r0]
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: vdupnneg75:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 d16, #0xb5
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = getelementptr inbounds %struct.int8x8_t, %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
   store <8 x i8> <i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75>, <8 x i8>* %0, align 8
@@ -784,18 +614,11 @@ define arm_aapcs_vfpcc void @noTruncStore(<4 x i32>* %a, <4 x i16>* %b) nounwind
 ; Use vmov.f32 to materialize f32 immediate splats
 ; rdar://10437054
 define arm_aapcs_vfpcc void @v_mov_v2f32(<2 x float>* nocapture %p) nounwind {
-; CHECK-LE-LABEL: v_mov_v2f32:
-; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov.f32 d16, #-1.600000e+01
-; CHECK-LE-NEXT:    vstr d16, [r0]
-; CHECK-LE-NEXT:    mov pc, lr
-;
-; CHECK-BE-LABEL: v_mov_v2f32:
-; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov.f32 d16, #-1.600000e+01
-; CHECK-BE-NEXT:    vrev64.32 d16, d16
-; CHECK-BE-NEXT:    vstr d16, [r0]
-; CHECK-BE-NEXT:    mov pc, lr
+; CHECK-LABEL: v_mov_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 d16, #-1.600000e+01
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    mov pc, lr
 entry:
   store <2 x float> <float -1.600000e+01, float -1.600000e+01>, <2 x float>* %p, align 4
   ret void
@@ -811,7 +634,6 @@ define arm_aapcs_vfpcc void @v_mov_v4f32(<4 x float>* nocapture %p) nounwind {
 ; CHECK-BE-LABEL: v_mov_v4f32:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vmov.f32 q8, #3.100000e+01
-; CHECK-BE-NEXT:    vrev64.32 q8, q8
 ; CHECK-BE-NEXT:    vstmia r0, {d16, d17}
 ; CHECK-BE-NEXT:    mov pc, lr
 entry:
@@ -861,7 +683,6 @@ define arm_aapcs_vfpcc void @any_extend(<4 x i1> %x, <4 x i32> %y) nounwind ssp
 ; CHECK-BE-NEXT:    vmov.i16 d16, #0x1
 ; CHECK-BE-NEXT:    vrev64.32 d17, d0
 ; CHECK-BE-NEXT:    vrev64.32 q9, q1
-; CHECK-BE-NEXT:    vrev32.16 d16, d16
 ; CHECK-BE-NEXT:    vand d16, d17, d16
 ; CHECK-BE-NEXT:    vrev32.16 d16, d16
 ; CHECK-BE-NEXT:    vmovl.u16 q8, d16
@@ -878,4 +699,599 @@ entry:
   unreachable
 }
 
+define arm_aapcs_vfpcc void @v_movi8_sti8(i8* %p) {
+; CHECK-LABEL: v_movi8_sti8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.8 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %p, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi8_sti16(i8* %p) {
+; CHECK-LABEL: v_movi8_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x i16>
+  call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi8_stf16(i8* %p) {
+; CHECK-LABEL: v_movi8_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi8_sti32(i8* %p) {
+; CHECK-LABEL: v_movi8_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi8_stf32(i8* %p) {
+; CHECK-LABEL: v_movi8_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi8_sti64(i8* %p) {
+; CHECK-LABEL: v_movi8_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x1
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi16_sti16(i8* %p) {
+; CHECK-LABEL: v_movi16_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x1
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> <i16 1, i16 1, i16 1, i16 1>, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi16_stf16(i8* %p) {
+; CHECK-LABEL: v_movi16_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x1
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <4 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi16_sti32(i8* %p) {
+; CHECK-LABEL: v_movi16_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <2 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi16_stf32(i8* %p) {
+; CHECK-LABEL: v_movi16_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <2 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi16_sti64(i8* %p) {
+; CHECK-LABEL: v_movi16_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 d16, #0x1
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi32_sti32(i8* %p) {
+; CHECK-LABEL: v_movi32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> <i32 1, i32 1>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi32_stf32(i8* %p) {
+; CHECK-LABEL: v_movi32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <2 x i32> <i32 1, i32 1> to <2 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi32_sti64(i8* %p) {
+; CHECK-LABEL: v_movi32_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x1
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <2 x i32> <i32 1, i32 1> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movf32_stf32(i8* %p) {
+; CHECK-LABEL: v_movf32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.f32 d16, #1.000000e+00
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> <float 1.0, float 1.0>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movf32_sti32(i8* %p) {
+; FIXME: We should use vmov.f32 instead of mov then vdup
+; CHECK-LABEL: v_movf32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r1, #1065353216
+; CHECK-NEXT:    vdup.32 d16, r1
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <2 x float> <float 1.0, float 1.0> to <2 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movf32_sti64(i8* %p) {
+; CHECK-LE-LABEL: v_movf32_sti64:
+; CHECK-LE:       @ %bb.0:
+; CHECK-LE-NEXT:    mov r1, #1065353216
+; CHECK-LE-NEXT:    vdup.32 d16, r1
+; CHECK-LE-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-LE-NEXT:    mov pc, lr
+;
+; FIXME: vrev is not needed here
+; CHECK-BE-LABEL: v_movf32_sti64:
+; CHECK-BE:       @ %bb.0:
+; CHECK-BE-NEXT:    mov r1, #1065353216
+; CHECK-BE-NEXT:    vdup.32 d16, r1
+; CHECK-BE-NEXT:    vrev64.32 d16, d16
+; CHECK-BE-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-BE-NEXT:    mov pc, lr
+  %val = bitcast <2 x float> <float 1.0, float 1.0> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movi64_sti64(i8* %p) {
+; CHECK-LABEL: v_movi64_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 d16, #0xff
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> <i64 255>, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_sti8(i8* %p) {
+; CHECK-LABEL: v_movQi8_sti8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.8 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %p, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_sti16(i8* %p) {
+; CHECK-LABEL: v_movQi8_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <8 x i16>
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_stf16(i8* %p) {
+; CHECK-LABEL: v_movQi8_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <8 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_sti32(i8* %p) {
+; CHECK-LABEL: v_movQi8_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_stf32(i8* %p) {
+; CHECK-LABEL: v_movQi8_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi8_sti64(i8* %p) {
+; CHECK-LABEL: v_movQi8_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 q8, #0x1
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi16_sti16(i8* %p) {
+; CHECK-LABEL: v_movQi16_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q8, #0x1
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi16_stf16(i8* %p) {
+; CHECK-LABEL: v_movQi16_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q8, #0x1
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <8 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi16_sti32(i8* %p) {
+; CHECK-LABEL: v_movQi16_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <4 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi16_stf32(i8* %p) {
+; CHECK-LABEL: v_movQi16_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <4 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi16_sti64(i8* %p) {
+; CHECK-LABEL: v_movQi16_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i16 q8, #0x1
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi32_sti32(i8* %p) {
+; CHECK-LABEL: v_movQi32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi32_stf32(i8* %p) {
+; CHECK-LABEL: v_movQi32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i32> <i32 1, i32 1, i32 1, i32 1> to <4 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi32_sti64(i8* %p) {
+; CHECK-LABEL: v_movQi32_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x1
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i32> <i32 1, i32 1, i32 1, i32 1> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQf32_stf32(i8* %p) {
+; CHECK-LABEL: v_movQf32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.f32 q8, #1.000000e+00
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQf32_sti32(i8* %p) {
+; FIXME: We should use vmov.f32 instead of mov then vdup
+; CHECK-LABEL: v_movQf32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r1, #1065353216
+; CHECK-NEXT:    vdup.32 q8, r1
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQf32_sti64(i8* %p) {
+; CHECK-LE-LABEL: v_movQf32_sti64:
+; CHECK-LE:       @ %bb.0:
+; CHECK-LE-NEXT:    mov r1, #1065353216
+; CHECK-LE-NEXT:    vdup.32 q8, r1
+; CHECK-LE-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-LE-NEXT:    mov pc, lr
+;
+; FIXME: vrev is not needed here
+; CHECK-BE-LABEL: v_movQf32_sti64:
+; CHECK-BE:       @ %bb.0:
+; CHECK-BE-NEXT:    mov r1, #1065353216
+; CHECK-BE-NEXT:    vdup.32 q8, r1
+; CHECK-BE-NEXT:    vrev64.32 q8, q8
+; CHECK-BE-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-BE-NEXT:    mov pc, lr
+  %val = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_movQi64_sti64(i8* %p) {
+; CHECK-LABEL: v_movQi64_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q8, #0xff
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> <i64 255, i64 255>, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni16_sti16(i8* %p) {
+; CHECK-LABEL: v_mvni16_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d16, #0xfe
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281>, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni16_stf16(i8* %p) {
+; CHECK-LABEL: v_mvni16_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d16, #0xfe
+; CHECK-NEXT:    vst1.16 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <4 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni16_sti32(i8* %p) {
+; CHECK-LABEL: v_mvni16_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d16, #0xfe
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <2 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni16_stf32(i8* %p) {
+; CHECK-LABEL: v_mvni16_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d16, #0xfe
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <2 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni16_sti64(i8* %p) {
+; CHECK-LABEL: v_mvni16_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 d16, #0xfe
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni32_sti32(i8* %p) {
+; CHECK-LABEL: v_mvni32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d16, #0xfe
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> <i32 4294967041, i32 4294967041>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni32_stf32(i8* %p) {
+; CHECK-LABEL: v_mvni32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d16, #0xfe
+; CHECK-NEXT:    vst1.32 {d16}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <2 x i32> <i32 4294967041, i32 4294967041> to <2 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvni32_sti64(i8* %p) {
+; CHECK-LABEL: v_mvni32_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 d16, #0xfe
+; CHECK-NEXT:    vst1.64 {d16}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <2 x i32> <i32 4294967041, i32 4294967041> to <1 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8)
+  ret void
+}
+
+
+define arm_aapcs_vfpcc void @v_mvnQi16_sti16(i8* %p) {
+; CHECK-LABEL: v_mvnQi16_sti16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 q8, #0xfe
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281>, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi16_stf16(i8* %p) {
+; CHECK-LABEL: v_mvnQi16_stf16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 q8, #0xfe
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <8 x half>
+  call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi16_sti32(i8* %p) {
+; CHECK-LABEL: v_mvnQi16_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 q8, #0xfe
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <4 x i32>
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi16_stf32(i8* %p) {
+; CHECK-LABEL: v_mvnQi16_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 q8, #0xfe
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <4 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi16_sti64(i8* %p) {
+; CHECK-LABEL: v_mvnQi16_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i16 q8, #0xfe
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi32_sti32(i8* %p) {
+; CHECK-LABEL: v_mvnQi32_sti32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 q8, #0xfe
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041>, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi32_stf32(i8* %p) {
+; CHECK-LABEL: v_mvnQi32_stf32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 q8, #0xfe
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041> to <4 x float>
+  call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @v_mvnQi32_sti64(i8* %p) {
+; CHECK-LABEL: v_mvnQi32_sti64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmvn.i32 q8, #0xfe
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:64]
+; CHECK-NEXT:    mov pc, lr
+  %val = bitcast <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041> to <2 x i64>
+  call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
 declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f16(i8*, <4 x half>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v1i64(i8*, <1 x i64>, i32) nounwind
+
+declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v8f16(i8*, <8 x half>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2i64(i8*, <2 x i64>, i32) nounwind

diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index 5bddc5ed8333..ca51edb92a2a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -1830,8 +1830,7 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2
 ; CHECK-BE-NEXT:    vldr d0, [r0]
 ; CHECK-BE-NEXT:    b .LBB49_3
 ; CHECK-BE-NEXT:  .LBB49_2:
-; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:  .LBB49_3: @ %else
 ; CHECK-BE-NEXT:    lsls r1, r1, #30
 ; CHECK-BE-NEXT:    it mi
@@ -1924,8 +1923,7 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%des
 ; CHECK-BE-NEXT:    vldr d0, [r0]
 ; CHECK-BE-NEXT:    b .LBB50_3
 ; CHECK-BE-NEXT:  .LBB50_2:
-; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:  .LBB50_3: @ %else
 ; CHECK-BE-NEXT:    lsls r1, r1, #30
 ; CHECK-BE-NEXT:    it mi

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
index 2173112d9640..77dd9c5df95b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
@@ -4,91 +4,55 @@
 ; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKBE
 
 define arm_aapcs_vfpcc <16 x i8> @mov_int8_1() {
-; CHECKLE-LABEL: mov_int8_1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0x1
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int8_1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0x1
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int8_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0x1
+; CHECK-NEXT:    bx lr
 entry:
   ret <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 }
 
 define arm_aapcs_vfpcc <16 x i8> @mov_int8_m1() {
-; CHECKLE-LABEL: mov_int8_m1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0xff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int8_m1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0xff
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int8_m1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0xff
+; CHECK-NEXT:    bx lr
 entry:
   ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
 }
 
 define arm_aapcs_vfpcc <8 x i16> @mov_int16_1() {
-; CHECKLE-LABEL: mov_int16_1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i16 q0, #0x1
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int16_1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i16 q1, #0x1
-; CHECKBE-NEXT:    vrev64.16 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int16_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i16 q0, #0x1
+; CHECK-NEXT:    bx lr
 entry:
   ret <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 }
 
 define arm_aapcs_vfpcc <8 x i16> @mov_int16_m1() {
-; CHECKLE-LABEL: mov_int16_m1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0xff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int16_m1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0xff
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int16_m1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0xff
+; CHECK-NEXT:    bx lr
 entry:
   ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
 }
 
 define arm_aapcs_vfpcc <8 x i16> @mov_int16_256() {
-; CHECKLE-LABEL: mov_int16_256:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i16 q0, #0x100
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int16_256:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i16 q1, #0x100
-; CHECKBE-NEXT:    vrev64.16 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i16 q0, #0x100
+; CHECK-NEXT:    bx lr
 entry:
   ret <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
 }
 
 define arm_aapcs_vfpcc <8 x i16> @mov_int16_257() {
-; CHECKLE-LABEL: mov_int16_257:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0x1
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int16_257:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0x1
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int16_257:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0x1
+; CHECK-NEXT:    bx lr
 entry:
   ret <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
 }
@@ -125,61 +89,37 @@ entry:
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_1() {
-; CHECKLE-LABEL: mov_int32_1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x1
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x1
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x1
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_256() {
-; CHECKLE-LABEL: mov_int32_256:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x100
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_256:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x100
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x100
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 256, i32 256, i32 256, i32 256>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_65536() {
-; CHECKLE-LABEL: mov_int32_65536:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x10000
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_65536:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x10000
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_65536:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x10000
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 65536, i32 65536, i32 65536, i32 65536>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_16777216() {
-; CHECKLE-LABEL: mov_int32_16777216:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x1000000
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_16777216:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x1000000
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_16777216:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x1000000
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 16777216, i32 16777216, i32 16777216, i32 16777216>
 }
@@ -216,61 +156,37 @@ entry:
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_17919() {
-; CHECKLE-LABEL: mov_int32_17919:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x45ff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_17919:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x45ff
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_17919:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x45ff
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 17919, i32 17919, i32 17919, i32 17919>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_4587519() {
-; CHECKLE-LABEL: mov_int32_4587519:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i32 q0, #0x45ffff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_4587519:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i32 q1, #0x45ffff
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_4587519:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q0, #0x45ffff
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 4587519, i32 4587519, i32 4587519, i32 4587519>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_m1() {
-; CHECKLE-LABEL: mov_int32_m1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0xff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_m1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0xff
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_m1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0xff
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_4294901760() {
-; CHECKLE-LABEL: mov_int32_4294901760:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmvn.i32 q0, #0xffff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_4294901760:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmvn.i32 q1, #0xffff
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_4294901760:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmvn.i32 q0, #0xffff
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
 }
@@ -307,16 +223,10 @@ entry:
 }
 
 define arm_aapcs_vfpcc <4 x i32> @mov_int32_4278255615() {
-; CHECKLE-LABEL: mov_int32_4278255615:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmvn.i32 q0, #0xff0000
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int32_4278255615:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmvn.i32 q1, #0xff0000
-; CHECKBE-NEXT:    vrev64.32 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int32_4278255615:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmvn.i32 q0, #0xff0000
+; CHECK-NEXT:    bx lr
 entry:
   ret <4 x i32> <i32 4278255615, i32 4278255615, i32 4278255615, i32 4278255615>
 }
@@ -367,16 +277,10 @@ entry:
 }
 
 define arm_aapcs_vfpcc <2 x i64> @mov_int64_m1() {
-; CHECKLE-LABEL: mov_int64_m1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i8 q0, #0xff
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_int64_m1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i8 q1, #0xff
-; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_int64_m1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i8 q0, #0xff
+; CHECK-NEXT:    bx lr
 entry:
   ret <2 x i64> < i64 -1, i64 -1 >
 }
@@ -462,8 +366,7 @@ define arm_aapcs_vfpcc <16 x i8> @mov_int64_0f0f0f0f0f0f0f0f() {
 ;
 ; CHECKBE-LABEL: mov_int64_0f0f0f0f0f0f0f0f:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i16 q1, #0xff00
-; CHECKBE-NEXT:    vrev64.16 q0, q1
+; CHECKBE-NEXT:    vmov.i16 q0, #0xff00
 ; CHECKBE-NEXT:    bx lr
 entry:
   ret <16 x i8> <i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0>
@@ -532,32 +435,20 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x half> @mov_float16_1() {
-; CHECKLE-LABEL: mov_float16_1:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i16 q0, #0x3c00
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_float16_1:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i16 q1, #0x3c00
-; CHECKBE-NEXT:    vrev64.16 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_float16_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i16 q0, #0x3c00
+; CHECK-NEXT:    bx lr
 
 entry:
   ret <8 x half> <half 1.000000e+00, half 1.000000e+00, half 1.000000e+00, half 1.000000e+00, half 1.000000e+00, half 1.000000e+00, half 1.000000e+00, half 1.000000e+00>
 }
 
 define arm_aapcs_vfpcc <8 x half> @mov_float16_m3() {
-; CHECKLE-LABEL: mov_float16_m3:
-; CHECKLE:       @ %bb.0: @ %entry
-; CHECKLE-NEXT:    vmov.i16 q0, #0xc200
-; CHECKLE-NEXT:    bx lr
-;
-; CHECKBE-LABEL: mov_float16_m3:
-; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vmov.i16 q1, #0xc200
-; CHECKBE-NEXT:    vrev64.16 q0, q1
-; CHECKBE-NEXT:    bx lr
+; CHECK-LABEL: mov_float16_m3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i16 q0, #0xc200
+; CHECK-NEXT:    bx lr
 
 entry:
   ret <8 x half> <half -3.000000e+00, half -3.000000e+00, half -3.000000e+00, half -3.000000e+00, half -3.000000e+00, half -3.000000e+00, half -3.000000e+00, half -3.000000e+00>


        

