[llvm] fc69077 - Revert "[ARM] Transforming memcpy to Tail predicated Loop"

Thu May 6 04:42:33 PDT 2021

Author: Malhar Jajoo
Date: 2021-05-06T12:39:08+01:00
New Revision: fc690777fce0bf50a8f424b05993b1e218713ae5

URL: https://github.com/llvm/llvm-project/commit/fc690777fce0bf50a8f424b05993b1e218713ae5
DIFF: https://github.com/llvm/llvm-project/commit/fc690777fce0bf50a8f424b05993b1e218713ae5.diff

LOG: Revert "[ARM] Transforming memcpy to Tail predicated Loop"

Reverting commit since it causes failure (10462).
This reverts commit b856f4a232cbd43476e9b9f75c80aacfc6f5c152.

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
    llvm/lib/Target/ARM/ARMSubtarget.h
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

Removed: 
    llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
    llvm/test/CodeGen/Thumb2/mve-tp-loop.mir


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9931d2c357ace..9b4e1bc4046cd 100644

--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1802,7 +1802,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(ARMISD::CSINV)
     MAKE_CASE(ARMISD::CSNEG)
     MAKE_CASE(ARMISD::CSINC)
-    MAKE_CASE(ARMISD::MEMCPYLOOP)
 #undef MAKE_CASE
   }
   return nullptr;
@@ -11098,141 +11097,6 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
   return true;
 }
 
-/// Adds logic in loop entry MBB to calculate loop iteration count and adds
-/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
-static Register genTPEntry(MachineBasicBlock *TpEntry,
-                           MachineBasicBlock *TpLoopBody,
-                           MachineBasicBlock *TpExit, Register OpSizeReg,
-                           const TargetInstrInfo *TII, DebugLoc Dl,
-                           MachineRegisterInfo &MRI) {
-
-  // Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16.
-  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
-      .addUse(OpSizeReg)
-      .addImm(15)
-      .add(predOps(ARMCC::AL))
-      .addReg(0);
-
-  Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg)
-      .addUse(AddDestReg, RegState::Kill)
-      .addImm(16)
-      .add(predOps(ARMCC::AL))
-      .addReg(0);
-
-  Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
-  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
-      .addUse(BicDestReg, RegState::Kill)
-      .addImm(4)
-      .add(predOps(ARMCC::AL))
-      .addReg(0);
-
-  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
-  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
-      .addUse(LsrDestReg, RegState::Kill);
-
-  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
-      .addUse(TotalIterationsReg)
-      .addMBB(TpExit);
-
-  return TotalIterationsReg;
-}
-
-/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
-/// t2DoLoopEnd. These are used by later passes to generate tail predicated
-/// loops.
-static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
-                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
-                          const TargetInstrInfo *TII, DebugLoc Dl,
-                          MachineRegisterInfo &MRI, Register OpSrcReg,
-                          Register OpDestReg, Register ElementCountReg,
-                          Register TotalIterationsReg) {
-
-  // First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop
-  // iteration counter, predication counter Current position in the src array
-  Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
-      .addUse(OpSrcReg)
-      .addMBB(TpEntry)
-      .addUse(CurrSrcReg)
-      .addMBB(TpLoopBody);
-
-  // Current position in the dest array
-  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
-      .addUse(OpDestReg)
-      .addMBB(TpEntry)
-      .addUse(CurrDestReg)
-      .addMBB(TpLoopBody);
-
-  // Current loop counter
-  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
-  Register RemainingLoopIterationsReg =
-      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
-      .addUse(TotalIterationsReg)
-      .addMBB(TpEntry)
-      .addUse(RemainingLoopIterationsReg)
-      .addMBB(TpLoopBody);
-
-  // Predication counter
-  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
-      .addUse(ElementCountReg)
-      .addMBB(TpEntry)
-      .addUse(RemainingElementsReg)
-      .addMBB(TpLoopBody);
-
-  // Pass predication counter to VCTP
-  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
-      .addUse(PredCounterPhiReg)
-      .addImm(ARMVCC::None)
-      .addReg(0);
-
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
-      .addUse(PredCounterPhiReg)
-      .addImm(16)
-      .add(predOps(ARMCC::AL))
-      .addReg(0);
-
-  // VLDRB and VSTRB instructions, predicated using VPR
-  Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
-      .addDef(CurrSrcReg)
-      .addDef(LoadedValueReg)
-      .addReg(SrcPhiReg)
-      .addImm(16)
-      .addImm(ARMVCC::Then)
-      .addUse(VccrReg);
-
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
-      .addDef(CurrDestReg)
-      .addUse(LoadedValueReg, RegState::Kill)
-      .addReg(DestPhiReg)
-      .addImm(16)
-      .addImm(ARMVCC::Then)
-      .addUse(VccrReg);
-
-  // Add the pseudoInstrs for decrementing the loop counter and marking the
-  // end:t2DoLoopDec and t2DoLoopEnd
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
-      .addUse(LoopCounterPhiReg)
-      .addImm(1);
-
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
-      .addUse(RemainingLoopIterationsReg)
-      .addMBB(TpLoopBody);
-
-  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
-      .addMBB(TpExit)
-      .add(predOps(ARMCC::AL));
-}
-
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -11259,91 +11123,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return BB;
   }
 
-  case ARM::MVE_MEMCPYLOOPINST: {
-
-    // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
-    // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
-    // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
-    // adds the relevant instructions in the TP loop Body for generation of a
-    // WLSTP loop.
-
-    // Below is relevant portion of the CFG after the transformation.
-    // The Machine Basic Blocks are shown along with branch conditions (in
-    // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
-    // portion of the CFG and may not necessarily be the entry/exit of the
-    // function.
-
-    //             (Relevant) CFG after transformation:
-    //               TP entry MBB
-    //                   |
-    //          |-----------------|
-    //       (n <= 0)          (n > 0)
-    //          |                 |
-    //          |         TP loop Body MBB<--|
-    //          |                |           |
-    //           \               |___________|
-    //            \             /
-    //              TP exit MBB
-
-    MachineFunction *MF = BB->getParent();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-
-    Register OpDestReg = MI.getOperand(0).getReg();
-    Register OpSrcReg = MI.getOperand(1).getReg();
-    Register OpSizeReg = MI.getOperand(2).getReg();
-
-    // Allocate the required MBBs and add to parent function.
-    MachineBasicBlock *TpEntry = BB;
-    MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
-    MachineBasicBlock *TpExit;
-
-    MF->push_back(TpLoopBody);
-
-    // If any instructions are present in the current block after
-    // MVE_MEMCPYLOOPINST, split the current block and move the instructions
-    // into the newly created exit block. If there are no instructions
-    // add an explicit branch to the FallThrough block and then split.
-    //
-    // The split is required for two reasons:
-    // 1) A terminator(t2WhileLoopStart) will be placed at that site.
-    // 2) Since a TPLoopBody will be added later, any phis in successive blocks
-    //    need to be updated. splitAt() already handles this.
-    TpExit = BB->splitAt(MI, false);
-    if (TpExit == BB) {
-      assert(BB->canFallThrough() &&
-             "Exit block must be FallThrough of the block containing memcpy");
-      TpExit = BB->getFallThrough();
-      BuildMI(BB, dl, TII->get(ARM::t2B))
-          .addMBB(TpExit)
-          .add(predOps(ARMCC::AL));
-      TpExit = BB->splitAt(MI, false);
-    }
-
-    // Add logic for iteration count
-    Register TotalIterationsReg =
-        genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
-
-    // Add the vectorized (and predicated) loads/store instructions
-    genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
-                  OpDestReg, OpSizeReg, TotalIterationsReg);
-
-    // Connect the blocks
-    TpEntry->addSuccessor(TpLoopBody);
-    TpLoopBody->addSuccessor(TpLoopBody);
-    TpLoopBody->addSuccessor(TpExit);
-
-    // Reorder for a more natural layout
-    TpLoopBody->moveAfter(TpEntry);
-    TpExit->moveAfter(TpLoopBody);
-
-    // Finally, remove the memcpy Psuedo Instruction
-    MI.eraseFromParent();
-
-    // Return the exit block as it may contain other instructions requiring a
-    // custom inserter
-    return TpExit;
-  }
-
   // The Thumb2 pre-indexed stores have the same MI operands, they just
   // define them differently in the .td files from the isel patterns, so
   // they need pseudos.

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index b604fae7bc5a1..41bebeee9c31b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -300,10 +300,6 @@ class VectorType;
     // instructions.
     MEMCPY,
 
-    // Pseudo-instruction representing a memory copy using a tail predicated
-    // loop
-    MEMCPYLOOP,
-
     // V8.1MMainline condition select
     CSINV, // Conditional select invert.
     CSNEG, // Conditional select negate.

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index bf9e52fbdf113..0356f4235c23d 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -6865,18 +6865,6 @@ class MVE_WLSTP<string asm, bits<2> size>
   let isTerminator = 1;
 }
 
-def SDT_MVEMEMCPYLOOPNODE
-    : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
-def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE,
-                                [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-
-let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-  def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
-        (ins rGPR:$dst, rGPR:$src, rGPR:$sz),
-        NoItinerary,
-        [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
-}
-
 def MVE_DLSTP_8  : MVE_DLSTP<"dlstp.8",  0b00>;
 def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
 def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;

diff  --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a0c82851b2db1..7e06229b60c3e 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -11,27 +11,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetMachine.h"
-#include "ARMTargetTransformInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
-#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "arm-selectiondag-info"
 
-cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
-    "arm-memtransfer-tploop", cl::Hidden,
-    cl::desc("Control conversion of memcpy to "
-             "Tail predicated loops (WLSTP)"),
-    cl::init(TPLoop::ForceDisabled),
-    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
-                          "Don't convert memcpy to TP loop."),
-               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
-                          "Always convert memcpy to TP loop."),
-               clEnumValN(TPLoop::Allow, "allow",
-                          "Allow (may be subject to certain conditions) "
-                          "conversion of memcpy to TP loop.")));
-
 // Emit, if possible, a specialized version of the given Libcall. Typically this
 // means selecting the appropriately aligned version, but we also convert memset
 // of 0 into memclr.
@@ -145,40 +130,13 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const ARMSubtarget &Subtarget =
       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-
-  auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
-                         const SelectionDAG &DAG) {
-    auto &F = DAG.getMachineFunction().getFunction();
-    if (!EnableMemtransferTPLoop)
-      return false;
-    if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
-      return true;
-    // Do not generate inline TP loop if optimizations is disabled,
-    // or if optimization for size (-Os or -Oz) is on.
-    if (F.hasOptNone() || F.hasOptSize())
-      return false;
-    // If cli option is unset
-    if (!ConstantSize && Alignment >= Align(4))
-      return true;
-    if (ConstantSize &&
-        ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
-        ConstantSize->getZExtValue() <
-            Subtarget.getMaxTPLoopInlineSizeThreshold())
-      return true;
-    return false;
-  };
-
-  if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
-    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
-                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
-
   // Do repeated 4-byte loads and stores. To be improved.
   // This requires 4-byte alignment.
   if (Alignment < Align(4))
     return SDValue();
   // This requires the copy size to be a constant, preferably
   // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   if (!ConstantSize)
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   Alignment.value(), RTLIB::MEMCPY);

diff  --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 91c7b7cf20037..44ef8593f6ffa 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -538,11 +538,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
     return 64;
   }
 
-  /// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size
-  /// that still makes it profitable to inline the call as a Tail
-  /// Predicated loop
-  unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
-
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index c44689dae853c..6ad6a16d03052 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -48,11 +48,6 @@ namespace TailPredication {
   };
 }
 
-// For controlling conversion of memcpy into Tail Predicated loop.
-namespace TPLoop {
-enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
-}
-
 class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
   using BaseT = BasicTTIImplBase<ARMTTIImpl>;
   using TTI = TargetTransformInfo;

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index a489a02e6d029..8a4665a19a16f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -1,39 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
 
 define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memcpy:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB0_5
+; CHECK-NEXT:    blt .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    lsl.w r12, r3, #2
-; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:    mov r8, r3
+; CHECK-NEXT:    mov r5, r2
+; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    lsls r4, r3, #2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
-; CHECK-NEXT:    adds r4, r1, r7
-; CHECK-NEXT:    adds r5, r0, r7
-; CHECK-NEXT:    mov r6, r3
-; CHECK-NEXT:    wlstp.8 lr, r6, .LBB0_3
-; CHECK-NEXT:    b .LBB0_4
-; CHECK-NEXT:  .LBB0_3: @ %for.body
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    add r7, r12
-; CHECK-NEXT:    subs r2, #1
-; CHECK-NEXT:    beq .LBB0_5
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_4: @ Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.u8 q0, [r4], #16
-; CHECK-NEXT:    vstrb.8 q0, [r5], #16
-; CHECK-NEXT:    letp lr, .LBB0_4
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r0, r7, r6
+; CHECK-NEXT:    add.w r1, r9, r6
+; CHECK-NEXT:    mov r2, r8
+; CHECK-NEXT:    bl __aeabi_memcpy4
+; CHECK-NEXT:    add r6, r4
+; CHECK-NEXT:    subs r5, #1
+; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:  .LBB0_3: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %for.body, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
deleted file mode 100644
index a87fff9aa92aa..0000000000000
--- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
+++ /dev/null
@@ -1,285 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
-
-; Check that WLSTP loop is not generated for alignment < 4
-; void test1(char* dest, char* src, int n){
-;    memcpy(dest, src, n);
-; }
-
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
-
-define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
-; CHECK-LABEL: test1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    bl __aeabi_memcpy
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false)
-  ret void
-}
-
-
-; Check that WLSTP loop is generated for alignment >= 4
-; void test2(int* restrict X, int* restrict Y, int n){
-;     memcpy(X, Y, n);
-; }
-
-define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){
-; CHECK-LABEL: test2:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_2
-; CHECK-NEXT:  .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
-; CHECK-NEXT:    vstrb.8 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB1_1
-; CHECK-NEXT:  .LBB1_2: @ %entry
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
-  ret void
-}
-
-
-; Checks that transform handles some arithmetic on the input arguments.
-; void test3(int* restrict X, int* restrict Y, int n)
-; {
-;     memcpy(X+2, Y+3, (n*2)+10);
-; }
-
-define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
-; CHECK-LABEL: test3:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    movs r3, #10
-; CHECK-NEXT:    add.w r2, r3, r2, lsl #1
-; CHECK-NEXT:    adds r1, #12
-; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB2_2
-; CHECK-NEXT:  .LBB2_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
-; CHECK-NEXT:    vstrb.8 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB2_1
-; CHECK-NEXT:  .LBB2_2: @ %entry
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %add.ptr = getelementptr inbounds i32, i32* %X, i32 2
-  %0 = bitcast i32* %add.ptr to i8*
-  %add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3
-  %1 = bitcast i32* %add.ptr1 to i8*
-  %mul = shl nsw i32 %n, 1
-  %add = add nsw i32 %mul, 10
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false)
-  ret void
-}
-
-
-; Checks that transform handles for loops that are implicitly converted to mempcy
-; void test4(int* restrict X, int* restrict Y, int n){
-;     for(int i = 0; i < n; ++i){
-;         X[i] = Y[i];
-;     }
-; }
-
-define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
-; CHECK-LABEL: test4:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB3_1: @ %for.body.preheader
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB3_3
-; CHECK-NEXT:  .LBB3_2: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
-; CHECK-NEXT:    vstrb.8 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB3_2
-; CHECK-NEXT:  .LBB3_3: @ %for.body.preheader
-; CHECK-NEXT:    pop.w {r7, lr}
-; CHECK-NEXT:    bx lr
-entry:
-  %cmp6 = icmp sgt i32 %n, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %X.bits = bitcast i32* %X to i8*
-  %Y.bits = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
-  ret void
-}
-
-; Checks that transform can handle > i32 size inputs
-define void @test5(i8* noalias %X, i8* noalias %Y, i64 %n){
-; CHECK-LABEL: test5:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB4_2
-; CHECK-NEXT:  .LBB4_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
-; CHECK-NEXT:    vstrb.8 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB4_1
-; CHECK-NEXT:  .LBB4_2:
-; CHECK-NEXT:    pop {r7, pc}
-    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %X, i8* align 4 %Y, i64 %n, i1 false)
-    ret void
-}
-
-; Checks the transform is applied for constant size inputs below a certain threshold (128 in this case)
-define void @test6(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
-; CHECK-LABEL: test6:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    movs r2, #127
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB5_2
-; CHECK-NEXT:  .LBB5_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
-; CHECK-NEXT:    vstrb.8 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB5_1
-; CHECK-NEXT:  .LBB5_2: @ %entry
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* noundef nonnull align 4 dereferenceable(127) %0, i8* noundef nonnull align 4 dereferenceable(127) %1, i32 127, i1 false)
-  ret void
-}
-
-; Checks the transform is NOT applied for constant size inputs above a certain threshold (128 in this case)
-define void @test7(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
-; CHECK-LABEL: test7:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    movs r2, #128
-; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 128, i1 false)
-  ret void
-}
-
-; Checks the transform is NOT applied for constant size inputs below a certain threshold (64 in this case)
-define void @test8(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
-; CHECK-LABEL: test8:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    ldm.w r1, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    stm.w r0, {r2, r3, r4, r12, lr}
-; CHECK-NEXT:    pop {r4, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 60, i1 false)
-  ret void
-}
-
-; Checks the transform is NOT applied (regardless of alignment) when optimizations are disabled
-define void @test9(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #0 {
-; CHECK-LABEL: test9:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
-  ret void
-}
-
-; Checks the transform is NOT applied (regardless of alignment) when optimization for size is on (-Os or -Oz)
-define void @test10(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #1 {
-; CHECK-LABEL: test10:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    pop {r7, pc}
-entry:
-  %0 = bitcast i32* %X to i8*
-  %1 = bitcast i32* %Y to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
-  ret void
-}
-
-define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
-; CHECK-LABEL: test11:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    popgt {r4, pc}
-; CHECK-NEXT:  .LBB10_1: @ %prehead
-; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    bic r3, r3, #16
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    lsr.w lr, r3, #4
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    subs.w lr, lr, #0
-; CHECK-NEXT:    beq .LBB10_3
-; CHECK-NEXT:  .LBB10_2: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.8 r3
-; CHECK-NEXT:    subs r3, #16
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q0, [r12], #16
-; CHECK-NEXT:    vstrbt.8 q0, [r4], #16
-; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    bne .LBB10_2
-; CHECK-NEXT:    b .LBB10_3
-; CHECK-NEXT:  .LBB10_3: @ %for.body
-; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrb r3, [r0], #1
-; CHECK-NEXT:    subs r2, #2
-; CHECK-NEXT:    strb r3, [r1], #1
-; CHECK-NEXT:    bne .LBB10_3
-; CHECK-NEXT:  @ %bb.4: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
-entry:
-  %cmp6 = icmp slt i32 %n, 0
-  br i1 %cmp6, label %prehead, label %for.cond.cleanup
-
-prehead:                                          ; preds = %entry
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %x, i8* align 4 %y, i32 %n, i1 false)
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %prehead
-  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
-  %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
-  %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
-  %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
-  %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
-  %l = load i8, i8* %x.addr.08, align 1
-  store i8 %l, i8* %y.addr.07, align 1
-  %inc = add nuw nsw i32 %i.09, 2
-  %exitcond.not = icmp eq i32 %inc, %n
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-
-for.cond.cleanup:                                 ; preds = %entry
-  ret void
-}
-
-attributes #0 = { noinline  optnone }
-attributes #1 = { optsize }

diff  --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
deleted file mode 100644
index 4dfba16102f0a..0000000000000
--- a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
+++ /dev/null
@@ -1,127 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s
---- |
-  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-  target triple = "arm-arm-none-eabi"
-
-  ; Function Attrs: argmemonly nofree nosync nounwind willreturn
-  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
-
-  define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
-  entry:
-    %0 = bitcast i32* %X to i8*
-    %1 = bitcast i32* %Y to i8*
-    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
-    ret void
-  }
-
-  define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
-  entry:
-    %cmp6 = icmp sgt i32 %n, 0
-    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-  for.body.preheader:                               ; preds = %entry
-    %X.bits = bitcast i32* %X to i8*
-    %Y.bits = bitcast i32* %Y to i8*
-    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
-    br label %for.cond.cleanup
-
-  for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
-    ret void
-  }
-
-...
----
-name:            test1
-tracksRegLiveness: true
-body:             |
-  bb.0.entry:
-    liveins: $r0, $r1, $r2
-
-    ; CHECK-LABEL: name: test1
-    ; CHECK: liveins: $r0, $r1, $r2
-    ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
-    ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
-    ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
-    ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
-    ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
-    ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
-    ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
-    ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
-    ; CHECK: .1:
-    ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1
-    ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1
-    ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1
-    ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1
-    ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
-    ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
-    ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
-    ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
-    ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
-    ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
-    ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
-    ; CHECK: .2.entry:
-    ; CHECK: tBX_RET 14 /* CC::al */, $noreg
-    %2:rgpr = COPY $r2
-    %1:rgpr = COPY $r1
-    %0:rgpr = COPY $r0
-    MVE_MEMCPYLOOPINST %0, %1, %2
-    tBX_RET 14 /* CC::al */, $noreg
-
-...
----
-name:            test2
-tracksRegLiveness: true
-body:             |
-  ; CHECK-LABEL: name: test2
-  ; CHECK: bb.0.entry:
-  ; CHECK:   successors: %bb.1(0x50000000), %bb.2(0x30000000)
-  ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   [[COPY:%[0-9]+]]:rgpr = COPY $r2
-  ; CHECK:   [[COPY1:%[0-9]+]]:rgpr = COPY $r1
-  ; CHECK:   [[COPY2:%[0-9]+]]:rgpr = COPY $r0
-  ; CHECK:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
-  ; CHECK:   t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
-  ; CHECK:   t2B %bb.1, 14 /* CC::al */, $noreg
-  ; CHECK: bb.1.for.body.preheader:
-  ; CHECK:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
-  ; CHECK:   t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
-  ; CHECK: bb.3:
-  ; CHECK:   [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3
-  ; CHECK:   [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3
-  ; CHECK:   [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3
-  ; CHECK:   [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3
-  ; CHECK:   [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
-  ; CHECK:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
-  ; CHECK:   [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
-  ; CHECK:   [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
-  ; CHECK:   t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
-  ; CHECK:   t2B %bb.4, 14 /* CC::al */, $noreg
-  ; CHECK: bb.4.for.body.preheader:
-  ; CHECK:   t2B %bb.2, 14 /* CC::al */, $noreg
-  ; CHECK: bb.2.for.cond.cleanup:
-  ; CHECK:   tBX_RET 14 /* CC::al */, $noreg
-  bb.0.entry:
-    successors: %bb.1(0x50000000), %bb.2(0x30000000)
-    liveins: $r0, $r1, $r2
-
-    %2:rgpr = COPY $r2
-    %1:rgpr = COPY $r1
-    %0:rgpr = COPY $r0
-    t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
-    t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
-    t2B %bb.1, 14 /* CC::al */, $noreg
-
-  bb.1.for.body.preheader:
-    successors: %bb.2(0x80000000)
-
-    MVE_MEMCPYLOOPINST %0, %1, %2
-
-  bb.2.for.cond.cleanup:
-    tBX_RET 14 /* CC::al */, $noreg
-
-...