[llvm] ca78151 - [ARM] Introduce MVEEXT ISel lowering

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 12 23:21:31 PDT 2021


Author: David Green
Date: 2021-07-13T07:21:20+01:00
New Revision: ca78151001d80d0fd1a2a6db4742f5f673572f7c

URL: https://github.com/llvm/llvm-project/commit/ca78151001d80d0fd1a2a6db4742f5f673572f7c
DIFF: https://github.com/llvm/llvm-project/commit/ca78151001d80d0fd1a2a6db4742f5f673572f7c.diff

LOG: [ARM] Introduce MVEEXT ISel lowering

Similar to D91921 (and D104515), this introduces two new nodes, MVESEXT and
MVEZEXT, that larger-than-legal sext and zext are lowered to. These either
get optimized away or end up becoming a series of stack loads/stores, in
order to perform the extending whilst keeping the order of the lanes
correct. They are generated from v8i16->v8i32, v16i8->v16i16 and
v16i8->v16i32 extends, potentially with an intermediate extend for the
larger v16i8->v16i32 extend. A number of combines have been added for
obvious cases that come up in the tests, notably MVEEXT of shuffles. More
may be needed in the future, but this covers most of the cases seen so far.

Differential Revision: https://reviews.llvm.org/D105090
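
As an illustration of what the fallback lowering produces, below is a minimal
standalone sketch (plain C++, not LLVM code; the helper name and the
little-endian lane layout are assumptions) of the stack round-trip for a
v8i16 -> v8i32 sign extend: the vector is spilled to a 16-byte stack slot and
read back with two widening loads of the low and high halves, so the lane
order is preserved.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

// Emulates "VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8".
static std::array<int32_t, 8> sextV8i16ToV8i32(const std::array<int16_t, 8> &A) {
  alignas(4) uint8_t Stack[16];
  std::memcpy(Stack, A.data(), 16); // store the whole v8i16 to the stack slot

  std::array<int32_t, 8> Out;
  for (int Half = 0; Half < 2; ++Half)   // two widening loads: [sp], [sp, #8]
    for (int Lane = 0; Lane < 4; ++Lane) {
      int16_t Elt;
      std::memcpy(&Elt, Stack + Half * 8 + Lane * 2, 2);
      Out[Half * 4 + Lane] = static_cast<int32_t>(Elt); // per-lane sign extend
    }
  return Out;
}

int main() {
  std::array<int16_t, 8> In = {-1, 2, -3, 4, -5, 6, -7, 8};
  std::array<int32_t, 8> Res = sextV8i16ToV8i32(In);
  for (int I = 0; I < 8; ++I)
    assert(Res[I] == static_cast<int32_t>(In[I])); // lanes stay in order
  return 0;
}

The larger v16i8 -> v16i32 case performs the same round-trip twice, going
through the intermediate v8i16 extend mentioned above.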

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
    llvm/test/CodeGen/Thumb2/mve-sext.ll
    llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
    llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
    llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9e419a5d1239..2317448ace59 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -452,6 +452,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
   }
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
   setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
   setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
 }
@@ -1685,6 +1691,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(ARMISD::WIN__DBZCHK)
     MAKE_CASE(ARMISD::PREDICATE_CAST)
     MAKE_CASE(ARMISD::VECTOR_REG_CAST)
+    MAKE_CASE(ARMISD::MVESEXT)
+    MAKE_CASE(ARMISD::MVEZEXT)
     MAKE_CASE(ARMISD::MVETRUNC)
     MAKE_CASE(ARMISD::VCMP)
     MAKE_CASE(ARMISD::VCMPZ)
@@ -9014,6 +9022,39 @@ static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
 }
 
+static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return SDValue();
+
+  // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
+
+  EVT ToVT = N->getValueType(0);
+  if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
+    return SDValue();
+  SDValue Op = N->getOperand(0);
+  EVT FromVT = Op.getValueType();
+  if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
+    ExtVT = MVT::v8i16;
+
+  unsigned Opcode =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
+  SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
+  SDValue Ext1 = Ext.getValue(1);
+
+  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
+    Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
+    Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
+  }
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
+}
+
 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
 /// element has been zero/sign-extended, depending on the isSigned parameter,
 /// from an integer type half its size.
@@ -10141,6 +10182,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
   case ISD::TRUNCATE:      return LowerTruncate(Op.getNode(), DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:   return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::SET_ROUNDING:  return LowerSET_ROUNDING(Op, DAG);
   case ISD::MUL:           return LowerMUL(Op, DAG);
@@ -10290,6 +10333,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::TRUNCATE:
     Res = LowerTruncate(N, DAG, Subtarget);
     break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = LowerVectorExtend(N, DAG, Subtarget);
+    break;
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -16542,10 +16589,8 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   EVT FromEltVT = FromVT.getVectorElementType();
 
   unsigned NumElements = 0;
-  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+  if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
     NumElements = 4;
-  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
-    NumElements = 8;
   if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
     NumElements = 4;
   if (NumElements == 0 ||
@@ -17358,7 +17403,7 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
   // the trunc to anything better, lower it to a stack store and reload,
   // performing the truncation whilst keeping the lanes in the correct order:
   //   VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
-  if (DCI.isBeforeLegalizeOps())
+  if (!DCI.isAfterLegalizeDAG())
     return SDValue();
 
   SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
@@ -17388,6 +17433,178 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
   return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
 }
 
+// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
+static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
+                                                    SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
+  if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
+    return SDValue();
+
+  EVT FromVT = LD->getMemoryVT();
+  EVT ToVT = N->getValueType(0);
+  if (!ToVT.isVector())
+    return SDValue();
+  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
+  EVT ToEltVT = ToVT.getVectorElementType();
+  EVT FromEltVT = FromVT.getVectorElementType();
+
+  unsigned NumElements = 0;
+  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+    NumElements = 4;
+  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+    NumElements = 8;
+  assert(NumElements != 0);
+
+  ISD::LoadExtType NewExtType =
+      N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
+      LD->getExtensionType() != ISD::EXTLOAD &&
+      LD->getExtensionType() != NewExtType)
+    return SDValue();
+
+  LLVMContext &C = *DAG.getContext();
+  SDLoc DL(LD);
+  // Details about the old load
+  SDValue Ch = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  Align Alignment = LD->getOriginalAlign();
+  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = LD->getAAInfo();
+
+  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+  EVT NewFromVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+    SDValue NewPtr =
+        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
+
+    SDValue NewLoad =
+        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                    Alignment, MMOFlags, AAInfo);
+    Loads.push_back(NewLoad);
+    Chains.push_back(SDValue(NewLoad.getNode(), 1));
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+  return DAG.getMergeValues(Loads, DL);
+}
+
+// Perform combines for MVEEXT. If it has not been optimized to anything
+// better before lowering, it gets converted to a stack store and extloads
+// performing the extend whilst still keeping the same lane ordering.
+SDValue ARMTargetLowering::PerformMVEExtCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
+  assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
+
+  EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+      *DAG.getContext());
+  auto Extend = [&](SDValue V) {
+    SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
+    return N->getOpcode() == ARMISD::MVESEXT
+               ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
+                             DAG.getValueType(ExtVT))
+               : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
+  };
+
+  // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
+  if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
+    SDValue Ext = Extend(N->getOperand(0));
+    return DAG.getMergeValues({Ext, Ext}, DL);
+  }
+
+  // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
+    ArrayRef<int> Mask = SVN->getMask();
+    assert(Mask.size() == 2 * VT.getVectorNumElements());
+    assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
+    unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
+    SDValue Op0 = SVN->getOperand(0);
+    SDValue Op1 = SVN->getOperand(1);
+
+    auto CheckInregMask = [&](int Start, int Offset) {
+      for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
+        if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
+          return false;
+      return true;
+    };
+    SDValue V0 = SDValue(N, 0);
+    SDValue V1 = SDValue(N, 1);
+    if (CheckInregMask(0, 0))
+      V0 = Extend(Op0);
+    else if (CheckInregMask(0, 1))
+      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+    else if (CheckInregMask(0, Mask.size()))
+      V0 = Extend(Op1);
+    else if (CheckInregMask(0, Mask.size() + 1))
+      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+
+    if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
+      V1 = Extend(Op1);
+    else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
+      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+    else if (CheckInregMask(VT.getVectorNumElements(), 0))
+      V1 = Extend(Op0);
+    else if (CheckInregMask(VT.getVectorNumElements(), 1))
+      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+
+    if (V0.getNode() != N || V1.getNode() != N)
+      return DAG.getMergeValues({V0, V1}, DL);
+  }
+
+  // MVEEXT(load) -> extload, extload
+  if (N->getOperand(0)->getOpcode() == ISD::LOAD)
+    if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
+      return L;
+
+  if (!DCI.isAfterLegalizeDAG())
+    return SDValue();
+
+  // Lower to a stack store and reload:
+  //  VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
+  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  int NumOuts = N->getNumValues();
+  assert((NumOuts == 2 || NumOuts == 4) &&
+         "Expected 2 or 4 outputs to an MVEEXT");
+  EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+      *DAG.getContext());
+  if (N->getNumOperands() == 4)
+    LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
+                               StackPtr, MPI, Align(4));
+
+  SmallVector<SDValue> Loads;
+  for (int I = 0; I < NumOuts; I++) {
+    SDValue Ptr = DAG.getNode(
+        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+        DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
+    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
+    SDValue Load = DAG.getExtLoad(
+        N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
+        VT, Chain, Ptr, MPI, LoadVT, Align(4));
+    Loads.push_back(Load);
+  }
+
+  return DAG.getMergeValues(Loads, DL);
+}
+
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -17463,6 +17680,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::MVETRUNC:
     return PerformMVETruncCombine(N, DCI);
+  case ARMISD::MVESEXT:
+  case ARMISD::MVEZEXT:
+    return PerformMVEExtCombine(N, DCI);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ISD::VECREDUCE_ADD:

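For reference, the MVEEXT(shuffle) combine added above relies on the fact that
sign extending the even lanes of a v8i16 is the same as reinterpreting the
register as v4i32 and sign extending the low 16 bits of each 32-bit lane in
place (SIGN_EXTEND_INREG, i.e. vmovlb.s16); the odd lanes are handled by a
VREV first. A minimal standalone sketch of that equivalence (plain C++, not
part of the patch; the helper names are made up and a little-endian lane
layout is assumed):

#include <array>
#include <cassert>
#include <cstdint>

// Reference result: take the even lanes of a v8i16, then sext each to i32.
static std::array<int32_t, 4> sextEvenLanes(const std::array<int16_t, 8> &V) {
  std::array<int32_t, 4> Out;
  for (int I = 0; I < 4; ++I)
    Out[I] = static_cast<int32_t>(V[2 * I]);
  return Out;
}

// What the combine emits instead: view the same 128-bit register as v4i32 and
// sign extend the low 16 bits of every 32-bit lane in place, with no shuffle.
static std::array<int32_t, 4> sextInRegLowHalves(const std::array<int16_t, 8> &V) {
  std::array<int32_t, 4> Out;
  for (int I = 0; I < 4; ++I) {
    // On little-endian, i32 lane I holds i16 lanes 2*I (low) and 2*I+1 (high).
    uint32_t Word =
        static_cast<uint16_t>(V[2 * I]) |
        (static_cast<uint32_t>(static_cast<uint16_t>(V[2 * I + 1])) << 16);
    Out[I] = static_cast<int32_t>(static_cast<int16_t>(Word & 0xFFFF));
  }
  return Out;
}

int main() {
  std::array<int16_t, 8> In = {-1, 2, -3, 4, -5, 6, -7, 8};
  assert(sextEvenLanes(In) == sextInRegLowHalves(In)); // same lanes, same values
  return 0;
}
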
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index f91e7854f199..98ea3b06614a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -139,7 +139,9 @@ class VectorType;
     PREDICATE_CAST,  // Predicate cast for MVE i1 types
     VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
 
-    MVETRUNC,     // Legalization aid for truncating two vectors into one.
+    MVESEXT,  // Legalization aids for extending a vector into two/four vectors.
+    MVEZEXT,  //  or truncating two/four vectors into one. Eventually becomes
+    MVETRUNC, //  stack store/load sequence, if not optimized to anything else.
 
     VCMP,  // Vector compare.
     VCMPZ, // Vector compare to zero.
@@ -423,6 +425,7 @@ class VectorType;
     SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index ac9267da9f69..70fa9cd9ce05 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -550,77 +550,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    blt .LBB11_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r8, r2, #7
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    sub.w r12, r8, #8
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    vmov.i16 q1, #0x8
-; CHECK-NEXT:    add.w r1, r4, r12, lsr #3
-; CHECK-NEXT:    adr r4, .LCPI11_0
-; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT:    bic r12, r1, #7
+; CHECK-NEXT:    add r1, sp, #8
+; CHECK-NEXT:    sub.w r3, r12, #8
+; CHECK-NEXT:    add.w r8, r5, r3, lsr #3
+; CHECK-NEXT:    adr r5, .LCPI11_0
+; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:  .LBB11_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r5, [sp] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB11_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r7, q2[6]
-; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r7
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    vmov.u16 r7, q2[5]
-; CHECK-NEXT:    vmov.u16 r5, q2[2]
-; CHECK-NEXT:    vmov q4[3], q4[1], r7, r3
-; CHECK-NEXT:    vmov.u16 r6, q2[0]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    mov r10, r1
+; CHECK-NEXT:    vldrh.s32 q4, [r1, #8]
+; CHECK-NEXT:    vldrh.s32 q3, [r1]
+; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vshl.i32 q4, q4, #1
-; CHECK-NEXT:    vmov.u16 r5, q2[3]
-; CHECK-NEXT:    vmov.u16 r6, q2[1]
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
-; CHECK-NEXT:    vmov r5, r6, d9
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vmov r3, r7, d8
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
-; CHECK-NEXT:    vadd.i16 q2, q2, q1
+; CHECK-NEXT:    vadd.i32 q4, q4, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r9, r10, d7
-; CHECK-NEXT:    ldrh.w r12, [r5]
-; CHECK-NEXT:    vmov r5, r1, d6
-; CHECK-NEXT:    ldrh.w r11, [r6]
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov r1, r2, d9
+; CHECK-NEXT:    vmov r6, r7, d7
+; CHECK-NEXT:    vmov r3, r4, d8
+; CHECK-NEXT:    ldrh.w r11, [r2]
+; CHECK-NEXT:    vmov r2, r9, d6
+; CHECK-NEXT:    ldrh r6, [r6]
 ; CHECK-NEXT:    ldrh r7, [r7]
-; CHECK-NEXT:    ldrh.w r6, [r9]
-; CHECK-NEXT:    ldrh.w r10, [r10]
-; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q3[0], r5
-; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r9, [r9]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[1], r9
 ; CHECK-NEXT:    vmov.16 q3[2], r6
-; CHECK-NEXT:    vmov.16 q3[3], r10
+; CHECK-NEXT:    vmov.16 q3[3], r7
 ; CHECK-NEXT:    vmov.16 q3[4], r3
-; CHECK-NEXT:    vmov.16 q3[5], r7
-; CHECK-NEXT:    vmov.16 q3[6], r12
+; CHECK-NEXT:    vmov.16 q3[5], r4
+; CHECK-NEXT:    vmov.16 q3[6], r1
+; CHECK-NEXT:    mov r1, r10
 ; CHECK-NEXT:    vmov.16 q3[7], r11
-; CHECK-NEXT:    vstrb.8 q3, [r4], #16
+; CHECK-NEXT:    vstrb.8 q3, [r5], #16
 ; CHECK-NEXT:    le lr, .LBB11_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
-; CHECK-NEXT:    cmp r8, r2
+; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB11_2
 ; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -676,172 +670,147 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    .pad #136
+; CHECK-NEXT:    sub sp, #136
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #64] @ 4-byte Spill
 ; CHECK-NEXT:    mov r1, r2
-; CHECK-NEXT:    str r2, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #68] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB12_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    adr r6, .LCPI12_2
-; CHECK-NEXT:    vldrw.u32 q1, [r6]
-; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT:    adr r3, .LCPI12_2
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    bic r1, r1, #7
-; CHECK-NEXT:    str r1, [sp, #52] @ 4-byte Spill
-; CHECK-NEXT:    sub.w r3, r1, #8
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i16 q0, #0x18
-; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
-; CHECK-NEXT:    adr r3, .LCPI12_0
-; CHECK-NEXT:    vldrw.u32 q1, [r3]
-; CHECK-NEXT:    adr r7, .LCPI12_1
-; CHECK-NEXT:    str r1, [sp, #48] @ 4-byte Spill
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r7]
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    subs r1, #8
+; CHECK-NEXT:    vstrw.32 q0, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i16 q2, #0x18
+; CHECK-NEXT:    add.w r1, r2, r1, lsr #3
+; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT:    adr r1, .LCPI12_0
+; CHECK-NEXT:    adr r2, .LCPI12_1
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #24] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add r2, sp, #120
+; CHECK-NEXT:    vstrw.32 q0, [sp, #8] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #48] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT:    add.w r10, sp, #104
 ; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr.w r12, [sp, #56] @ 4-byte Reload
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #64] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #24] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #8] @ 16-byte Reload
 ; CHECK-NEXT:  .LBB12_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    vmov.u16 r5, q5[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[7]
-; CHECK-NEXT:    vmov.u16 r5, q5[5]
-; CHECK-NEXT:    vmov.u16 r4, q6[2]
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
-; CHECK-NEXT:    vmov.u16 r1, q6[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov.u16 r6, q5[0]
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r7, r5, d3
-; CHECK-NEXT:    vmov r3, r8, d2
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r4
-; CHECK-NEXT:    vmov.u16 r1, q6[3]
-; CHECK-NEXT:    vmov.u16 r4, q6[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r1
-; CHECK-NEXT:    vmov.u16 r4, q5[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r6, r4
-; CHECK-NEXT:    vmov.u16 r4, q5[3]
-; CHECK-NEXT:    vmov.u16 r6, q5[1]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q2[3], q2[1], r6, r4
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vmov r1, r11, d3
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov.u16 r6, q6[4]
-; CHECK-NEXT:    vadd.i16 q5, q5, q0
-; CHECK-NEXT:    ldrh.w r10, [r5]
-; CHECK-NEXT:    vmov r4, r5, d4
-; CHECK-NEXT:    ldrh r2, [r3]
-; CHECK-NEXT:    ldrh.w r9, [r7]
-; CHECK-NEXT:    vmov.u16 r7, q4[4]
-; CHECK-NEXT:    ldrh.w r8, [r8]
+; CHECK-NEXT:    vstrw.32 q5, [r2]
+; CHECK-NEXT:    mov r8, r2
+; CHECK-NEXT:    vldrh.s32 q0, [r2, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vldrh.s32 q0, [r2]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vmov r6, r2, d4
 ; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w r12, [r4]
+; CHECK-NEXT:    add r4, sp, #88
+; CHECK-NEXT:    ldrh.w r11, [r5]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh r5, [r6]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vstrw.32 q6, [r4]
+; CHECK-NEXT:    vldrh.s32 q0, [r4]
+; CHECK-NEXT:    vmov.16 q7[0], r5
+; CHECK-NEXT:    vmov.16 q7[1], r2
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r6, r9, d0
+; CHECK-NEXT:    vmov r2, r5, d1
+; CHECK-NEXT:    vldrh.s32 q0, [r4, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vmov.16 q1[0], r6
+; CHECK-NEXT:    ldrh.w r6, [r9]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q1[1], r6
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    vmov.16 q1[3], r5
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov r2, r5, d1
+; CHECK-NEXT:    vmov.16 q1[5], r6
+; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vstrw.32 q4, [r10]
+; CHECK-NEXT:    vldrh.s32 q0, [r6]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.16 q1[7], r5
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r5, d0
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[1], r5
+; CHECK-NEXT:    vmov r2, r5, d5
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i16 q6, q6, q2
+; CHECK-NEXT:    vadd.i16 q5, q5, q2
+; CHECK-NEXT:    vadd.i16 q4, q4, q2
+; CHECK-NEXT:    ldrh.w r9, [r2]
+; CHECK-NEXT:    vmov r2, r4, d1
+; CHECK-NEXT:    vldrh.s32 q0, [r6, #8]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q7[2], r9
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vmov.16 q7[3], r5
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov.16 q7[4], r1
+; CHECK-NEXT:    vmov.16 q7[5], r3
+; CHECK-NEXT:    vmov.16 q7[6], r12
+; CHECK-NEXT:    vmov.16 q7[7], r11
+; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q7[0], r4
-; CHECK-NEXT:    ldrh r4, [r5]
-; CHECK-NEXT:    vmov.16 q7[1], r4
-; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.16 q3[3], r4
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q1[0], r4
-; CHECK-NEXT:    ldrh r4, [r5]
-; CHECK-NEXT:    vmov.u16 r5, q6[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
-; CHECK-NEXT:    vmov.u16 r5, q6[7]
-; CHECK-NEXT:    vmov.u16 r6, q6[5]
-; CHECK-NEXT:    vmov.16 q1[1], r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    ldrh.w r5, [r11]
-; CHECK-NEXT:    vshl.i32 q3, q3, #1
-; CHECK-NEXT:    vadd.i16 q6, q6, q0
-; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov.16 q1[3], r5
-; CHECK-NEXT:    vmov r1, r4, d6
-; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.16 q3[5], r4
+; CHECK-NEXT:    vmov r2, r4, d1
+; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov r1, r3, d7
-; CHECK-NEXT:    vmov.16 q1[5], r4
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov r1, r4, d5
-; CHECK-NEXT:    ldrh r6, [r1]
-; CHECK-NEXT:    ldrh r1, [r3]
-; CHECK-NEXT:    vmov.u16 r3, q4[2]
-; CHECK-NEXT:    ldrh r5, [r4]
-; CHECK-NEXT:    vmov.u16 r4, q4[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q4[3]
-; CHECK-NEXT:    vmov.u16 r4, q4[1]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r4, q4[6]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov.16 q7[2], r6
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vmov.16 q7[3], r5
-; CHECK-NEXT:    vadd.i32 q3, q2, r0
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov r1, r3, d6
-; CHECK-NEXT:    vmov.16 q7[5], r8
-; CHECK-NEXT:    vmov.16 q7[6], r9
-; CHECK-NEXT:    vmov.16 q7[7], r10
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    ldrh r1, [r3]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r1, r3, d7
-; CHECK-NEXT:    vmov q3[2], q3[0], r7, r4
-; CHECK-NEXT:    vmov.u16 r4, q4[7]
-; CHECK-NEXT:    vmov.u16 r7, q4[5]
-; CHECK-NEXT:    vadd.i16 q4, q4, q0
-; CHECK-NEXT:    vmov q3[3], q3[1], r7, r4
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vshl.i32 q3, q3, #1
-; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.16 q2[3], r3
-; CHECK-NEXT:    vmov r1, r3, d6
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.16 q2[5], r3
-; CHECK-NEXT:    vmov r1, r3, d7
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.16 q2[7], r3
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q7
-; CHECK-NEXT:    vstrb.8 q1, [r12], #16
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    mov r2, r8
+; CHECK-NEXT:    vmov.16 q3[7], r4
+; CHECK-NEXT:    vadd.i16 q0, q3, q1
+; CHECK-NEXT:    vadd.i16 q0, q0, q7
+; CHECK-NEXT:    vstrb.8 q0, [r7], #16
 ; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    ldr r2, [sp, #52] @ 4-byte Reload
-; CHECK-NEXT:    cmp r2, r1
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r3, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT:    cmp r1, r3
 ; CHECK-NEXT:    bne.w .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #136
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 2d66b4bc2ab6..668abe4c43cd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -335,26 +335,35 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov r1, r2, d1
-; CHECK-NEXT:    vmov r12, r3, d0
+; CHECK-NEXT:    vmov r3, r1, d1
+; CHECK-NEXT:    vmov r12, r2, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    vmov r0, lr, d1
-; CHECK-NEXT:    ldrh r7, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh.w r2, [r12]
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov lr, r0, d1
+; CHECK-NEXT:    ldrh r7, [r1]
+; CHECK-NEXT:    ldrh.w r1, [r12]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    vmov r0, r5, d0
 ; CHECK-NEXT:    ldrh.w r6, [lr]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r7
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r6
+; CHECK-NEXT:    vmov.16 q0[3], r4
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r7
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
@@ -368,26 +377,35 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov r1, r2, d1
-; CHECK-NEXT:    vmov r12, r3, d0
+; CHECK-NEXT:    vmov r3, r1, d1
+; CHECK-NEXT:    vmov r12, r2, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    vmov r0, lr, d1
-; CHECK-NEXT:    ldrh r7, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh.w r2, [r12]
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov lr, r0, d1
+; CHECK-NEXT:    ldrh r7, [r1]
+; CHECK-NEXT:    ldrh.w r1, [r12]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    vmov r0, r5, d0
 ; CHECK-NEXT:    ldrh.w r6, [lr]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r7
-; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r6
+; CHECK-NEXT:    vmov.16 q0[3], r4
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r7
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index 5542cf1f9cd6..44fd3e621969 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -275,88 +275,50 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
-; CHECK-NEXT:    vmov.u8 r2, q1[12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
-; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[10]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
-; CHECK-NEXT:    vmov.u8 r2, q1[8]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vadd.i32 q3, q4, q3
-; CHECK-NEXT:    vshr.u32 q3, q3, #1
-; CHECK-NEXT:    vstrb.32 q3, [r0, #12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[11]
-; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
-; CHECK-NEXT:    vmov.u8 r2, q1[4]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vadd.i32 q3, q4, q3
-; CHECK-NEXT:    vshr.u32 q3, q3, #1
-; CHECK-NEXT:    vstrb.32 q3, [r0, #8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
-; CHECK-NEXT:    vmov.u8 r2, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vadd.i32 q3, q4, q3
-; CHECK-NEXT:    vshr.u32 q3, q3, #1
-; CHECK-NEXT:    vstrb.32 q3, [r0, #4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vand q1, q3, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
-; CHECK-NEXT:    vmovlb.s8 q0, q2
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .pad #112
+; CHECK-NEXT:    sub sp, #112
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vstrw.32 q0, [r4]
+; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
+; CHECK-NEXT:    add r3, sp, #64
+; CHECK-NEXT:    add r5, sp, #32
+; CHECK-NEXT:    add r0, sp, #80
+; CHECK-NEXT:    vstrw.32 q0, [r3]
+; CHECK-NEXT:    add r2, sp, #48
+; CHECK-NEXT:    vldrb.s16 q0, [r4, #8]
+; CHECK-NEXT:    vstrw.32 q0, [r5]
+; CHECK-NEXT:    vldrb.u16 q0, [r1]
+; CHECK-NEXT:    add r1, sp, #96
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q0, [r4]
+; CHECK-NEXT:    vstrw.32 q0, [r2]
+; CHECK-NEXT:    vldrh.u32 q0, [r3, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r5, #8]
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
 ; CHECK-NEXT:    vshr.u32 q0, q0, #1
-; CHECK-NEXT:    vstrb.32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vstrb.32 q0, [r1, #12]
+; CHECK-NEXT:    vldrh.u32 q0, [r3]
+; CHECK-NEXT:    vldrh.s32 q1, [r5]
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    vstrb.32 q0, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r2, #8]
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    vstrb.32 q0, [r1, #4]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    vstrb.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    add sp, #112
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sa = sext <16 x i8> %a to <16 x i32>
   %sb = zext <16 x i8> %b to <16 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll
index da804a528551..b4cef9c9ada4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -125,41 +125,13 @@ entry:
 define arm_aapcs_vfpcc <16 x i16> @sext_v16i8_v16i16(<16 x i8> %src) {
 ; CHECK-LABEL: sext_v16i8_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q2, q1
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sext <16 x i8> %src to <16 x i16>
@@ -169,21 +141,13 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: sext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q2, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sext <8 x i16> %src to <8 x i32>
@@ -193,42 +157,21 @@ entry:
 define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK-LABEL: sext_v16i8_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q4, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vmovlb.s16 q3, q0
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #32
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.s16 q0, [r0, #8]
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q3, [r0, #8]
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sext <16 x i8> %src to <16 x i32>
@@ -285,41 +228,13 @@ entry:
 define arm_aapcs_vfpcc <16 x i16> @zext_v16i8_v16i16(<16 x i8> %src) {
 ; CHECK-LABEL: zext_v16i8_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q2, q1
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext <16 x i8> %src to <16 x i16>
@@ -329,21 +244,13 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @zext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: zext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q2, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext <8 x i16> %src to <8 x i32>
@@ -353,39 +260,21 @@ entry:
 define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK-LABEL: zext_v16i8_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vand q4, q1, q3
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT:    vand q3, q5, q3
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #32
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vldrh.u32 q3, [r0, #8]
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext <16 x i8> %src to <16 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
index 715be1d921ca..043f7d9576a3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
@@ -17,23 +17,16 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sext_i32_0246_swapped(<8 x i16> %src) {
 ; CHECK-LABEL: sext_i32_0246_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmovlb.s16 q0, q1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
 ; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %out = sext <8 x i16> %src to <8 x i32>
@@ -55,24 +48,17 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sext_i32_1357_swapped(<8 x i16> %src) {
 ; CHECK-LABEL: sext_i32_1357_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.f32 s0, s5
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.f32 s1, s7
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov.f32 s2, s9
-; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %out = sext <8 x i16> %src to <8 x i32>
@@ -95,40 +81,23 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_i32_02468101214_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: sext_i32_02468101214_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmovlb.s16 q0, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
 ; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.f32 s3, s14
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmovlb.s16 q1, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s7, s14
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %out = sext <16 x i16> %src to <16 x i32>
@@ -151,42 +120,25 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_i32_13579111315_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: sext_i32_13579111315_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q3, [r1]
 ; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vmov.f32 s1, s11
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.f32 s2, s13
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.f32 s3, s15
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.f32 s4, s9
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vmov.f32 s5, s11
-; CHECK-NEXT:    vmov.f32 s6, s13
-; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f32 s5, s15
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %out = sext <16 x i16> %src to <16 x i32>
@@ -208,17 +160,16 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @zext_i32_0246_swapped(<8 x i16> %src) {
 ; CHECK-LABEL: zext_i32_0246_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmovlb.u16 q2, q0
-; CHECK-NEXT:    vmovlb.u16 q0, q1
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
 ; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %out = zext <8 x i16> %src to <8 x i32>
@@ -240,18 +191,17 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @zext_i32_1357_swapped(<8 x i16> %src) {
 ; CHECK-LABEL: zext_i32_1357_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vmovlb.u16 q2, q0
-; CHECK-NEXT:    vmov.f32 s0, s5
-; CHECK-NEXT:    vmov.f32 s1, s7
-; CHECK-NEXT:    vmov.f32 s2, s9
-; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %out = zext <8 x i16> %src to <8 x i32>
@@ -274,28 +224,23 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @zext_i32_02468101214_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: zext_i32_02468101214_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmovlb.u16 q3, q0
-; CHECK-NEXT:    vmovlb.u16 q0, q2
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r1, #8]
 ; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.f32 s3, s14
-; CHECK-NEXT:    vmovlb.u16 q3, q1
-; CHECK-NEXT:    vmovlb.u16 q1, q2
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s7, s14
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %out = zext <16 x i16> %src to <16 x i32>
@@ -318,30 +263,25 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @zext_i32_13579111315_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: zext_i32_13579111315_swapped:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmovlb.u16 q2, q2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmovlb.u16 q3, q0
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q3, [r1]
 ; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov.f32 s1, s11
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov.f32 s2, s13
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmovlb.u16 q2, q2
-; CHECK-NEXT:    vmov.f32 s3, s15
-; CHECK-NEXT:    vmovlb.u16 q3, q1
-; CHECK-NEXT:    vmov.f32 s4, s9
-; CHECK-NEXT:    vmov.f32 s5, s11
-; CHECK-NEXT:    vmov.f32 s6, s13
-; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vldrh.u32 q2, [r1, #8]
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f32 s5, s15
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %out = zext <16 x i16> %src to <16 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 3ac9dcb83afe..d49b41ec3152 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -524,48 +524,37 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vmovlb.u8 q1, q1
-; CHECK-NEXT:    vcmp.i16 eq, q1, zr
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.i8 q1, #0x0
-; CHECK-NEXT:    vmov.i8 q2, #0xff
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vmovlb.u8 q0, q1
+; CHECK-NEXT:    vcmp.i16 eq, q0, zr
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.i32 q4, #0xffff
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q2, q3, q4
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmovlb.u16 q0, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vpt.i32 ne, q3, zr
-; CHECK-NEXT:    vaddt.i32 q2, q2, q0
-; CHECK-NEXT:    vaddv.u32 r0, q2
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vpt.i32 ne, q2, zr
+; CHECK-NEXT:    vaddt.i32 q1, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index 3bf046edfaa0..a6e6a9cc81ba 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -309,36 +309,23 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmullb.u16 q2, q3, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmullb.u16 q0, q1, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r1]
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 31f0aa5e3977..bec212a2ef08 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -489,62 +489,44 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmovlb.u8 q2, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q4, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vcmp.i16 eq, q2, zr
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    vmovlb.u16 q5, q3
-; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.u16 r1, q2[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.u16 r1, q2[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmovlb.u8 q0, q2
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vcmp.i16 eq, q0, zr
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r3, q0[1]
+; CHECK-NEXT:    vldrh.u32 q3, [r1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q3, q5, q4
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    vmult.i32 q1, q3, q2
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q3, [r1, #8]
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmul.i32 q2, q3, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.u16 r1, q2[4]
-; CHECK-NEXT:    vmullb.u16 q0, q1, q4
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.u16 r1, q2[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vpt.i32 ne, q1, zr
-; CHECK-NEXT:    vaddt.i32 q3, q3, q0
-; CHECK-NEXT:    vaddv.u32 r0, q3
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vpt.i32 ne, q3, zr
+; CHECK-NEXT:    vaddt.i32 q1, q1, q2
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 8004aad599b9..7e2374f2885f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -712,7 +712,7 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext16_02461357_0ext(<16 x i16> %src1, i16 %src2) {
 ; CHECK-LABEL: sext16_02461357_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vdup.16 q2, r0
 ; CHECK-NEXT:    vrev32.16 q1, q0
 ; CHECK-NEXT:    vmullb.s16 q1, q1, q2
 ; CHECK-NEXT:    vmullb.s16 q0, q0, q2
@@ -731,7 +731,7 @@ define arm_aapcs_vfpcc <8 x i32> @sext16_0ext_02461357(<16 x i16> %src1, i16 %sr
 ; CHECK-LABEL: sext16_0ext_02461357:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vrev32.16 q1, q0
-; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vdup.16 q2, r0
 ; CHECK-NEXT:    vmullb.s16 q1, q2, q1
 ; CHECK-NEXT:    vmullb.s16 q0, q2, q0
 ; CHECK-NEXT:    bx lr
@@ -922,7 +922,7 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @zext16_02461357_0ext(<16 x i16> %src1, i16 %src2) {
 ; CHECK-LABEL: zext16_02461357_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vdup.16 q2, r0
 ; CHECK-NEXT:    vrev32.16 q1, q0
 ; CHECK-NEXT:    vmullb.u16 q1, q1, q2
 ; CHECK-NEXT:    vmullb.u16 q0, q0, q2
@@ -941,7 +941,7 @@ define arm_aapcs_vfpcc <8 x i32> @zext16_0ext_02461357(<16 x i16> %src1, i16 %sr
 ; CHECK-LABEL: zext16_0ext_02461357:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vrev32.16 q1, q0
-; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vdup.16 q2, r0
 ; CHECK-NEXT:    vmullb.u16 q1, q2, q1
 ; CHECK-NEXT:    vmullb.u16 q0, q2, q0
 ; CHECK-NEXT:    bx lr
@@ -1132,7 +1132,7 @@ entry:
 define arm_aapcs_vfpcc <16 x i16> @sext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) {
 ; CHECK-LABEL: sext8_0246810121413579111315_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vdup.8 q2, r0
 ; CHECK-NEXT:    vrev16.8 q1, q0
 ; CHECK-NEXT:    vmullb.s8 q1, q1, q2
 ; CHECK-NEXT:    vmullb.s8 q0, q0, q2
@@ -1151,7 +1151,7 @@ define arm_aapcs_vfpcc <16 x i16> @sext8_0ext_0246810121413579111315(<32 x i8> %
 ; CHECK-LABEL: sext8_0ext_0246810121413579111315:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vrev16.8 q1, q0
-; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vdup.8 q2, r0
 ; CHECK-NEXT:    vmullb.s8 q1, q2, q1
 ; CHECK-NEXT:    vmullb.s8 q0, q2, q0
 ; CHECK-NEXT:    bx lr
@@ -1342,7 +1342,7 @@ entry:
 define arm_aapcs_vfpcc <16 x i16> @zext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) {
 ; CHECK-LABEL: zext8_0246810121413579111315_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vdup.8 q2, r0
 ; CHECK-NEXT:    vrev16.8 q1, q0
 ; CHECK-NEXT:    vmullb.u8 q1, q1, q2
 ; CHECK-NEXT:    vmullb.u8 q0, q0, q2
@@ -1361,7 +1361,7 @@ define arm_aapcs_vfpcc <16 x i16> @zext8_0ext_0246810121413579111315(<32 x i8> %
 ; CHECK-LABEL: zext8_0ext_0246810121413579111315:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vrev16.8 q1, q0
-; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vdup.8 q2, r0
 ; CHECK-NEXT:    vmullb.u8 q1, q2, q1
 ; CHECK-NEXT:    vmullb.u8 q0, q2, q0
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index f11d8a9c7aba..60bcc28e21dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -121,178 +121,99 @@ entry:
 define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
 ; CHECK-LABEL: vqdmulh_v32i8_b:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
-; CHECK-NEXT:    vmov.u8 r1, q2[12]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q2[15]
-; CHECK-NEXT:    vmov.u8 r1, q2[13]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmovlb.s16 q5, q5
-; CHECK-NEXT:    vmov.u8 r3, q1[8]
-; CHECK-NEXT:    vmul.i32 q4, q5, q4
-; CHECK-NEXT:    vshr.s32 q5, q4, #7
-; CHECK-NEXT:    vmov.i32 q4, #0x7f
-; CHECK-NEXT:    vmin.s32 q5, q5, q4
-; CHECK-NEXT:    vstrb.32 q5, [r0, #12]
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[10]
-; CHECK-NEXT:    vmov.u8 r2, q2[8]
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[11]
-; CHECK-NEXT:    vmov.u8 r2, q2[9]
-; CHECK-NEXT:    vmovlb.s16 q5, q5
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmovlb.s8 q6, q6
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q6, q6
-; CHECK-NEXT:    vmul.i32 q5, q6, q5
-; CHECK-NEXT:    vshr.s32 q5, q5, #7
-; CHECK-NEXT:    vmin.s32 q5, q5, q4
-; CHECK-NEXT:    vstrb.32 q5, [r0, #8]
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[6]
-; CHECK-NEXT:    vmov.u8 r2, q2[4]
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[7]
-; CHECK-NEXT:    vmov.u8 r2, q2[5]
-; CHECK-NEXT:    vmovlb.s16 q5, q5
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmovlb.s8 q6, q6
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q6, q6
-; CHECK-NEXT:    vmul.i32 q5, q6, q5
-; CHECK-NEXT:    vshr.s32 q5, q5, #7
-; CHECK-NEXT:    vmin.s32 q5, q5, q4
-; CHECK-NEXT:    vstrb.32 q5, [r0, #4]
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[2]
-; CHECK-NEXT:    vmov.u8 r2, q2[0]
-; CHECK-NEXT:    vmovlb.s8 q0, q5
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q2[3]
-; CHECK-NEXT:    vmov.u8 r2, q2[1]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
-; CHECK-NEXT:    vmovlb.s8 q2, q5
-; CHECK-NEXT:    vmov.u8 r2, q1[12]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
-; CHECK-NEXT:    vshr.s32 q0, q0, #7
-; CHECK-NEXT:    vmin.s32 q0, q0, q4
-; CHECK-NEXT:    vstrb.32 q0, [r0]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
-; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q3[14]
-; CHECK-NEXT:    vmov.u8 r2, q3[12]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q3[15]
-; CHECK-NEXT:    vmov.u8 r2, q3[13]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
-; CHECK-NEXT:    add r1, sp, #16
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
-; CHECK-NEXT:    vshr.s32 q0, q0, #7
-; CHECK-NEXT:    vmin.s32 q0, q0, q4
-; CHECK-NEXT:    vstrb.32 q0, [r1, #12]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.u8 r3, q1[9]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[10]
-; CHECK-NEXT:    vmov.u8 r3, q3[8]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[11]
-; CHECK-NEXT:    vmov.u8 r3, q3[9]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmov.u8 r3, q1[4]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
-; CHECK-NEXT:    vshr.s32 q0, q0, #7
-; CHECK-NEXT:    vmin.s32 q0, q0, q4
-; CHECK-NEXT:    vstrb.32 q0, [r1, #8]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.u8 r3, q1[5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[6]
-; CHECK-NEXT:    vmov.u8 r3, q3[4]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[7]
-; CHECK-NEXT:    vmov.u8 r3, q3[5]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmov.u8 r3, q1[0]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
-; CHECK-NEXT:    vshr.s32 q0, q0, #7
-; CHECK-NEXT:    vmin.s32 q0, q0, q4
-; CHECK-NEXT:    vstrb.32 q0, [r1, #4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[3]
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[2]
-; CHECK-NEXT:    vmov.u8 r3, q3[0]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[3]
-; CHECK-NEXT:    vmov.u8 r3, q3[1]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #224
+; CHECK-NEXT:    sub sp, #224
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    add r4, sp, #32
+; CHECK-NEXT:    add r2, sp, #16
+; CHECK-NEXT:    add r1, sp, #48
+; CHECK-NEXT:    vstrw.32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q2, [r4]
+; CHECK-NEXT:    vstrw.32 q1, [r2]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vldrb.s16 q0, [r3, #8]
+; CHECK-NEXT:    add r7, sp, #64
+; CHECK-NEXT:    add r0, sp, #128
+; CHECK-NEXT:    add r5, sp, #80
+; CHECK-NEXT:    vstrw.32 q0, [r7]
+; CHECK-NEXT:    add r6, sp, #144
+; CHECK-NEXT:    vldrb.s16 q0, [r4, #8]
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q0, [r3]
+; CHECK-NEXT:    add r3, sp, #96
+; CHECK-NEXT:    vstrw.32 q0, [r5]
+; CHECK-NEXT:    vldrb.s16 q0, [r4]
+; CHECK-NEXT:    add r4, sp, #160
+; CHECK-NEXT:    vstrw.32 q0, [r6]
+; CHECK-NEXT:    vldrb.s16 q0, [r2, #8]
+; CHECK-NEXT:    vstrw.32 q0, [r3]
+; CHECK-NEXT:    vldrb.s16 q0, [r1, #8]
+; CHECK-NEXT:    vstrw.32 q0, [r4]
+; CHECK-NEXT:    vldrb.s16 q0, [r2]
+; CHECK-NEXT:    add r2, sp, #112
+; CHECK-NEXT:    vstrw.32 q0, [r2]
+; CHECK-NEXT:    mov r12, r2
+; CHECK-NEXT:    vldrb.s16 q0, [r1]
+; CHECK-NEXT:    add r1, sp, #176
+; CHECK-NEXT:    add r2, sp, #192
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r7, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vshr.s32 q0, q0, #7
-; CHECK-NEXT:    vmin.s32 q0, q0, q4
-; CHECK-NEXT:    vstrb.32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vshr.s32 q1, q0, #7
+; CHECK-NEXT:    vmov.i32 q0, #0x7f
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r2, #12]
+; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r7]
+; CHECK-NEXT:    add r0, sp, #208
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r2, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r5, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r6, #8]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r2, #4]
+; CHECK-NEXT:    vldrh.s32 q1, [r5]
+; CHECK-NEXT:    vldrh.s32 q2, [r6]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r2]
+; CHECK-NEXT:    vldrh.s32 q1, [r3, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r4, #8]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r0, #12]
+; CHECK-NEXT:    vldrh.s32 q1, [r3]
+; CHECK-NEXT:    vldrh.s32 q2, [r4]
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r3, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q1, q1, q0
+; CHECK-NEXT:    vstrb.32 q1, [r0, #4]
+; CHECK-NEXT:    vldrh.s32 q1, [r3]
+; CHECK-NEXT:    vldrh.s32 q2, [r1]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vshr.s32 q1, q1, #7
+; CHECK-NEXT:    vmin.s32 q0, q1, q0
+; CHECK-NEXT:    vstrb.32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add sp, #224
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %l2 = sext <32 x i8> %s0 to <32 x i32>
   %l5 = sext <32 x i8> %s1 to <32 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
index d8a45ea1e8ef..1eb9e8bd7c8d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -349,43 +349,47 @@ entry:
 
 
 define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
-; CHECK-LABEL: foo_int32_int8_both:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.u16 r3, q1[4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.u16 r3, q1[5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmovlb.u16 q2, q0
-; CHECK-NEXT:    vldrb.s16 q0, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmovlb.u16 q1, q2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
-; CHECK-NEXT:    vmovlb.u16 q0, q1
-; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: foo_int32_int8_both:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #32
+; CHECK-LE-NEXT:    sub sp, #32
+; CHECK-LE-NEXT:    vldrb.s16 q0, [r1, #8]
+; CHECK-LE-NEXT:    add r2, sp, #16
+; CHECK-LE-NEXT:    vstrw.32 q0, [r2]
+; CHECK-LE-NEXT:    vldrb.s16 q0, [r1]
+; CHECK-LE-NEXT:    mov r1, sp
+; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LE-NEXT:    vldrh.u32 q0, [r2, #8]
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0, #48]
+; CHECK-LE-NEXT:    vldrh.u32 q0, [r2]
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0, #32]
+; CHECK-LE-NEXT:    vldrh.u32 q0, [r1, #8]
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0, #16]
+; CHECK-LE-NEXT:    vldrh.u32 q0, [r1]
+; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-LE-NEXT:    add sp, #32
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: foo_int32_int8_both:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #32
+; CHECK-BE-NEXT:    sub sp, #32
+; CHECK-BE-NEXT:    vldrb.s16 q0, [r1, #8]
+; CHECK-BE-NEXT:    add r2, sp, #16
+; CHECK-BE-NEXT:    vstrh.16 q0, [r2]
+; CHECK-BE-NEXT:    vldrb.s16 q0, [r1]
+; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
+; CHECK-BE-NEXT:    vldrh.u32 q0, [r2, #8]
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0, #48]
+; CHECK-BE-NEXT:    vldrh.u32 q0, [r2]
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0, #32]
+; CHECK-BE-NEXT:    vldrh.u32 q0, [r1, #8]
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0, #16]
+; CHECK-BE-NEXT:    vldrh.u32 q0, [r1]
+; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
+; CHECK-BE-NEXT:    add sp, #32
+; CHECK-BE-NEXT:    bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
   %0 = sext <16 x i8> %wide.load to <16 x i16>
@@ -416,12 +420,12 @@ define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r1, #32]!
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #24]
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #24]
+; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bx lr
 entry:


        

