[llvm] b2ac968 - [ARM] Alter t2DoLoopStart to define lr

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 10 07:58:14 PST 2020


Author: David Green
Date: 2020-11-10T15:57:58Z
New Revision: b2ac9681a700c23508d44825d98467bcecaf2e92

URL: https://github.com/llvm/llvm-project/commit/b2ac9681a700c23508d44825d98467bcecaf2e92
DIFF: https://github.com/llvm/llvm-project/commit/b2ac9681a700c23508d44825d98467bcecaf2e92.diff

LOG: [ARM] Alter t2DoLoopStart to define lr

This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR

This will hopefully tie low overhead loops together more tightly, so that
we can more reliably generate loops without reverting or being at the
whims of the register allocator.

This is a fairly simple change in itself, but leads to a number of other
required alterations.

 - The hardware loop pass, if UsePhi is set, now generates loops of the
   form:
       %start = llvm.start.loop.iterations(%N)
     loop:
       %p = phi [%start], [%dec]
       %dec = llvm.loop.decrement.reg(%p, 1)
       %c = icmp ne %dec, 0
       br %c, loop, exit
 - For this a new llvm.start.loop.iterations intrinsic was added, identical
   to llvm.set.loop.iterations except that it produces a value, as seen above,
   gluing the loop together more tightly through def-use chains (a fuller IR
   sketch follows this list).
 - This new intrinsic conceptually produces the same output as its input.
   SCEV is taught this so that the checks in MVETailPredication are not
   affected.
 - Some minor changes are needed to the ARMLowOverheadLoops pass, but it has
   been left mostly as before. We should now be able to tell more reliably
   that the t2DoLoopStart is correct without having to prove it, but
   t2WhileLoopStart and tail-predicated loops remain the same.
 - And all the tests have been updated. There are a lot of them!
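
For concreteness, here is a minimal sketch of the UsePhi loop form described
above, written as standalone LLVM IR (the function, its arguments and value
names are invented for illustration; only the intrinsic signatures match the
declarations used in the updated tests):

    declare i32 @llvm.start.loop.iterations.i32(i32)
    declare i32 @llvm.loop.decrement.reg.i32(i32, i32)

    define void @zero_n(i32* %p, i32 %N) {
    entry:
      %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
      br label %loop

    loop:                                   ; preds = %loop, %entry
      %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
      ; The counter phi is now fed by the intrinsic result, not by %N directly.
      %count = phi i32 [ %start, %entry ], [ %dec, %loop ]
      %addr = getelementptr i32, i32* %p, i32 %iv
      store i32 0, i32* %addr
      %iv.next = add nuw i32 %iv, 1
      %dec = call i32 @llvm.loop.decrement.reg.i32(i32 %count, i32 1)
      %cmp = icmp ne i32 %dec, 0
      br i1 %cmp, label %loop, label %exit

    exit:
      ret void
    }

Since llvm.start.loop.iterations is modelled in SCEV as returning its operand
unchanged, %start is equivalent to %N for trip-count purposes, while the
def-use chain from %start through the phi to the decrement keeps the
low-overhead loop tied together for the backend.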

This patch on its own might cause more trouble than it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.

Differential Revision: https://reviews.llvm.org/D89881

Added: 
    

Modified: 
    llvm/docs/LangRef.rst
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/Analysis/ScalarEvolution.cpp
    llvm/lib/CodeGen/HardwareLoops.cpp
    llvm/lib/Target/ARM/ARMInstrThumb2.td
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/lib/Target/ARM/MVETailPredication.cpp
    llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
    llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
    llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
    llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
    llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
    llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
    llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
    llvm/test/CodeGen/Thumb2/mve-vldst4.ll
    llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
    llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll
    llvm/test/Transforms/HardwareLoops/ARM/calls.ll
    llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
    llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
    llvm/test/Transforms/HardwareLoops/ARM/structure.ll
    llvm/test/Transforms/HardwareLoops/loop-guards.ll
    llvm/test/Transforms/HardwareLoops/scalar-while.ll

Removed: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir


################################################################################
diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c9248b49114b7..0d30ca4b347e2 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15502,6 +15502,45 @@ on their operand. It's a hint to the backend that can use this to set up the
 hardware-loop count with a target specific instruction, usually a move of this
 value to a special register or a hardware-loop instruction.
 
+
+'``llvm.start.loop.iterations.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.start.loop.iterations.i32(i32)
+      declare i64 @llvm.start.loop.iterations.i64(i64)
+
+Overview:
+"""""""""
+
+The '``llvm.start.loop.iterations.*``' intrinsics are similar to the
+'``llvm.set.loop.iterations.*``' intrinsics, used to specify the
+hardware-loop trip count but also produce a value identical to the input
+that can be used as the input to the loop. They are placed in the loop
+preheader basic block and the output is expected to be the input to the
+phi for the induction variable of the loop, decremented by the
+'``llvm.loop.decrement.reg.*``'.
+
+Arguments:
+""""""""""
+
+The integer operand is the loop trip count of the hardware-loop, and thus
+not e.g. the loop back-edge taken count.
+
+Semantics:
+""""""""""
+
+The '``llvm.start.loop.iterations.*``' intrinsics do not perform any arithmetic
+on their operand. It's a hint to the backend that can use this to set up the
+hardware-loop count with a target specific instruction, usually a move of this
+value to a special register or a hardware-loop instruction.
+
 '``llvm.test.set.loop.iterations.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4ee40856dcf22..8ea27402decc3 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1576,6 +1576,11 @@ def int_matrix_column_major_store
 def int_set_loop_iterations :
   DefaultAttrsIntrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
 
+// Same as the above, but produces a value (the same as the input operand) to
+// be fed into the loop.
+def int_start_loop_iterations :
+  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoDuplicate]>;
+
 // Specify that the value given is the number of iterations that the next loop
 // will execute. Also test that the given count is not zero, allowing it to
 // control entry to a 'while' loop.

diff  --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 4a0374f88d9d6..42d6a10518a3f 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6672,6 +6672,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
         const SCEV *ClampedX = getUMinExpr(X, getNotSCEV(Y));
         return getAddExpr(ClampedX, Y, SCEV::FlagNUW);
       }
+      case Intrinsic::start_loop_iterations:
+        // A start_loop_iterations is just equivalent to the first operand for
+        // SCEV purposes.
+        return getSCEV(II->getArgOperand(0));
       default:
         break;
       }

diff  --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index af0342ff6681a..76c9c1470868c 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -165,7 +165,7 @@ namespace {
     Value *InitLoopCount();
 
     // Insert the set_loop_iteration intrinsic.
-    void InsertIterationSetup(Value *LoopCountInit);
+    Value *InsertIterationSetup(Value *LoopCountInit);
 
     // Insert the loop_decrement intrinsic.
     void InsertLoopDec();
@@ -325,11 +325,11 @@ void HardwareLoop::Create() {
     return;
   }
 
-  InsertIterationSetup(LoopCountInit);
+  Value *Setup = InsertIterationSetup(LoopCountInit);
 
   if (UsePHICounter || ForceHardwareLoopPHI) {
     Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
-    Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
+    Value *EltsRem = InsertPHICounter(Setup, LoopDec);
     LoopDec->setOperand(0, EltsRem);
     UpdateBranch(LoopDec);
   } else
@@ -437,11 +437,13 @@ Value *HardwareLoop::InitLoopCount() {
   return Count;
 }
 
-void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
+Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
   IRBuilder<> Builder(BeginBB->getTerminator());
   Type *Ty = LoopCountInit->getType();
-  Intrinsic::ID ID = UseLoopGuard ?
-    Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations;
+  bool UsePhi = UsePHICounter || ForceHardwareLoopPHI;
+  Intrinsic::ID ID = UseLoopGuard ? Intrinsic::test_set_loop_iterations
+                                  : (UsePhi ? Intrinsic::start_loop_iterations
+                                           : Intrinsic::set_loop_iterations);
   Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
   Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit);
 
@@ -457,6 +459,7 @@ void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
   }
   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: "
              << *SetCount << "\n");
+  return UseLoopGuard ? LoopCountInit : SetCount;
 }
 
 void HardwareLoop::InsertLoopDec() {

diff  --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 74627b0c1cdcf..340078fcb92b9 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5420,9 +5420,11 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
   let isTerminator = 1;
 }
 
+let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
+
 def t2DoLoopStart :
-  t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
-  [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
+  [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
 
 let hasSideEffects = 0 in
 def t2LoopDec :
@@ -5444,6 +5446,8 @@ def t2LoopEnd :
 
 } // end isBranch, isTerminator, hasSideEffects
 
+}
+
 } // end isNotDuplicable
 
 class CS<string iname, bits<4> opcode, list<dag> pattern=[]>

diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 0ec47bade34b5..76584fca106e3 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -429,7 +429,10 @@ namespace {
     // Return the operand for the loop start instruction. This will be the loop
     // iteration count, or the number of elements if we're tail predicating.
     MachineOperand &getLoopStartOperand() {
-      return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0);
+      if (IsTailPredicationLegal())
+        return TPNumElements;
+      return Start->getOpcode() == ARM::t2DoLoopStart ? Start->getOperand(1)
+                                                      : Start->getOperand(0);
     }
 
     unsigned getStartOpcode() const {
@@ -495,6 +498,7 @@ namespace {
     bool RevertNonLoops();
 
     void RevertWhile(MachineInstr *MI) const;
+    void RevertDo(MachineInstr *MI) const;
 
     bool RevertLoopDec(MachineInstr *MI) const;
 
@@ -618,8 +622,12 @@ bool LowOverheadLoop::ValidateTailPredicate() {
   // count instead of iteration count, won't affect any other instructions
   // than the LoopStart and LoopDec.
   // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
-  if (StartInsertPt == Start && Start->getOperand(0).getReg() == ARM::LR) {
-    if (auto *IterCount = RDA.getMIOperand(Start, 0)) {
+  Register StartReg = Start->getOpcode() == ARM::t2DoLoopStart
+                          ? Start->getOperand(1).getReg()
+                          : Start->getOperand(0).getReg();
+  if (StartInsertPt == Start && StartReg == ARM::LR) {
+    if (auto *IterCount = RDA.getMIOperand(
+            Start, Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0)) {
       SmallPtrSet<MachineInstr *, 2> Uses;
       RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
       for (auto *Use : Uses) {
@@ -1053,53 +1061,15 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
                                     MachineBasicBlock *&InsertBB,
                                     ReachingDefAnalysis &RDA,
                                     InstSet &ToRemove) {
-    // We can define LR because LR already contains the same value.
-    if (Start->getOperand(0).getReg() == ARM::LR) {
+    // For a t2DoLoopStart it is always valid to use the start insertion point.
+    // For WLS we can define LR if LR already contains the same value.
+    if (Start->getOpcode() == ARM::t2DoLoopStart ||
+        Start->getOperand(0).getReg() == ARM::LR) {
       InsertPt = MachineBasicBlock::iterator(Start);
       InsertBB = Start->getParent();
       return true;
     }
 
-    Register CountReg = Start->getOperand(0).getReg();
-    auto IsMoveLR = [&CountReg](MachineInstr *MI) {
-      return MI->getOpcode() == ARM::tMOVr &&
-             MI->getOperand(0).getReg() == ARM::LR &&
-             MI->getOperand(1).getReg() == CountReg &&
-             MI->getOperand(2).getImm() == ARMCC::AL;
-    };
-
-    // Find an insertion point:
-    // - Is there a (mov lr, Count) before Start? If so, and nothing else
-    //   writes to Count before Start, we can insert at start.
-    if (auto *LRDef =
-            RDA.getUniqueReachingMIDef(Start, MCRegister::from(ARM::LR))) {
-      if (IsMoveLR(LRDef) &&
-          RDA.hasSameReachingDef(Start, LRDef, CountReg.asMCReg())) {
-        SmallPtrSet<MachineInstr *, 2> Ignore = { Dec };
-        if (!TryRemove(LRDef, RDA, ToRemove, Ignore))
-          return false;
-        InsertPt = MachineBasicBlock::iterator(Start);
-        InsertBB = Start->getParent();
-        return true;
-      }
-    }
-
-    // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
-    //   to Count after Start, we can insert at that mov (which will now be
-    //   dead).
-    MachineBasicBlock *MBB = Start->getParent();
-    if (auto *LRDef =
-            RDA.getLocalLiveOutMIDef(MBB, MCRegister::from(ARM::LR))) {
-      if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) {
-        SmallPtrSet<MachineInstr *, 2> Ignore = { Start, Dec };
-        if (!TryRemove(LRDef, RDA, ToRemove, Ignore))
-          return false;
-        InsertPt = MachineBasicBlock::iterator(LRDef);
-        InsertBB = LRDef->getParent();
-        return true;
-      }
-    }
-
     // We've found no suitable LR def and Start doesn't use LR directly. Can we
     // just define LR anyway?
     if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
@@ -1364,6 +1334,16 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
   MI->eraseFromParent();
 }
 
+void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
+  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
+  MachineBasicBlock *MBB = MI->getParent();
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr))
+      .add(MI->getOperand(0))
+      .add(MI->getOperand(1))
+      .add(predOps(ARMCC::AL));
+  MI->eraseFromParent();
+}
+
 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
   MachineBasicBlock *MBB = MI->getParent();
@@ -1432,7 +1412,7 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
 //
 //   $lr = big-itercount-expression
 //   ..
-//   t2DoLoopStart renamable $lr
+//   $lr = t2DoLoopStart renamable $lr
 //   vector.body:
 //     ..
 //     $vpr = MVE_VCTP32 renamable $r3
@@ -1455,7 +1435,8 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
 
   LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
 
-  MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0);
+  MachineInstr *Def = RDA->getMIOperand(
+      LoLoop.Start, LoLoop.Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0);
   if (!Def) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
     return;
@@ -1634,7 +1615,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
     if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
       RevertWhile(LoLoop.Start);
     else
-      LoLoop.Start->eraseFromParent();
+      RevertDo(LoLoop.Start);
     bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec);
     RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
   } else {
@@ -1699,7 +1680,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
       if (Start->getOpcode() == ARM::t2WhileLoopStart)
         RevertWhile(Start);
       else
-        Start->eraseFromParent();
+        RevertDo(Start);
     }
     for (auto *Dec : Decs)
       RevertLoopDec(Dec);

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d36e70f82059b..ccb375020c380 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1679,7 +1679,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
       switch (Call->getIntrinsicID()) {
       default:
         break;
-      case Intrinsic::set_loop_iterations:
+      case Intrinsic::start_loop_iterations:
       case Intrinsic::test_set_loop_iterations:
       case Intrinsic::loop_decrement:
       case Intrinsic::loop_decrement_reg:

diff  --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index a721a16705829..25d5fd7e69c6e 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -188,7 +188,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
         continue;
 
       Intrinsic::ID ID = Call->getIntrinsicID();
-      if (ID == Intrinsic::set_loop_iterations ||
+      if (ID == Intrinsic::start_loop_iterations ||
           ID == Intrinsic::test_set_loop_iterations)
         return cast<IntrinsicInst>(&I);
     }

diff  --git a/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir b/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
index aa3de2ed1fbdf..43c06333017f4 100644
--- a/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
+++ b/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir
@@ -152,7 +152,7 @@ body:             |
     $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5
     $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4
     $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0
-    t2DoLoopStart $r4
+    $lr = t2DoLoopStart $r4
     $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg
     tBL 14, $noreg, @z
   bb.1:
@@ -160,7 +160,7 @@ body:             |
     $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5
     $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4
     $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0
-    t2DoLoopStart $r4
+    $lr = t2DoLoopStart $r4
     $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg
     tBL 14, $noreg, @z
   bb.2:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
index 94a1ec9380fb2..6d60759a37e59 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
@@ -21,7 +21,7 @@
 ; CHECK-END:   b .LBB0_2
 define void @check_loop_dec_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -49,7 +49,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -64,7 +64,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ugt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -92,7 +92,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -107,7 +107,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ult_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -135,7 +135,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -150,7 +150,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_ult_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -179,7 +179,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -194,7 +194,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sgt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -222,7 +222,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -237,7 +237,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -265,7 +265,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -280,7 +280,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_sge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -309,7 +309,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -324,7 +324,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_uge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -352,7 +352,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -367,7 +367,7 @@ for.cond.cleanup:
 ; CHECK-MID:   tB %bb.2
 define void @check_loop_dec_uge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %for.body.preheader
 
 for.body.preheader:
@@ -396,7 +396,7 @@ for.header:
   %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
   %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
   %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-  %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+  %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ]
   br label %for.body
 
 for.cond.cleanup:
@@ -507,6 +507,6 @@ while.end:
   ret void
 }
 
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i1 @llvm.test.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
index db7ca7a55a5a5..81e1a3c3c5561 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@@ -17,17 +17,17 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[VECTOR_BODY75_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.body75.preheader:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[START1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY75:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[START2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP3]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[START2]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
@@ -48,7 +48,7 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no
 ; CHECK-NEXT:    [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[START1]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ]
 ; CHECK-NEXT:    [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@@ -88,19 +88,19 @@ for.body.lr.ph:                                   ; preds = %entry
   br i1 %tobool, label %vector.body75.preheader, label %vector.ph
 
 vector.body75.preheader:                          ; preds = %for.body.lr.ph
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %2)
   br label %vector.body75
 
 vector.ph:                                        ; preds = %for.body.lr.ph
   %broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %x, i32 0
   %broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %3)
+  %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %3)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %lsr.iv9 = phi i32* [ %scevgep10, %vector.body ], [ %d, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %4 = phi i32 [ %3, %vector.ph ], [ %8, %vector.body ]
+  %4 = phi i32 [ %start2, %vector.ph ], [ %8, %vector.body ]
   %lsr.iv911 = bitcast i32* %lsr.iv9 to <4 x i32>*
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -120,7 +120,7 @@ vector.body75:                                    ; preds = %vector.body75, %vec
   %lsr.iv3 = phi i32* [ %s2, %vector.body75.preheader ], [ %scevgep4, %vector.body75 ]
   %lsr.iv = phi i32* [ %d, %vector.body75.preheader ], [ %scevgep, %vector.body75 ]
   %index80 = phi i32 [ %index.next81, %vector.body75 ], [ 0, %vector.body75.preheader ]
-  %10 = phi i32 [ %2, %vector.body75.preheader ], [ %15, %vector.body75 ]
+  %10 = phi i32 [ %start1, %vector.body75.preheader ], [ %15, %vector.body75 ]
   %lsr.iv68 = bitcast i32* %lsr.iv6 to <4 x i32>*
   %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
   %lsr.iv2 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -148,7 +148,7 @@ for.cond.cleanup:                                 ; preds = %vector.body, %vecto
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
index add2b49a9e954..c56a8a239ddc5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir
@@ -79,7 +79,7 @@ body:             |
     $r4 = t2MOVTi16 killed $r4, target-flags(arm-hi16) @arm_cmplx_conj_f32_mve.cmplx_conj_sign, 14 /* CC::al */, $noreg
     renamable $q0 = nnan ninf nsz MVE_VLDRWU32 killed renamable $r4, 0, 0, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1 (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
index ca8c6d8a2649f..c65dc8e3af458 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
@@ -9,13 +9,13 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %while.body
 
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %0 = phi i32 [ %n, %entry ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %entry ], [ %2, %while.body ]
     %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
     %1 = load i32, i32* %scevgep6, align 4
@@ -30,7 +30,7 @@
     ret i32 0
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
 
   attributes #0 = { noduplicate nounwind }
@@ -112,7 +112,7 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
     $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index 29ecf00c556f0..dcfff5f5b6266 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -15,9 +15,9 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r12, r12, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    and r4, r12, #15
@@ -107,9 +107,9 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a,
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    and r5, r4, #15
@@ -210,9 +210,9 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@@ -309,9 +309,9 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    bic r4, r4, #3
 ; CHECK-NEXT:    sub.w lr, r4, #4
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    add.w r4, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12
@@ -402,8 +402,8 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %bb3
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB4_2: @ %bb9
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4
@@ -464,8 +464,8 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
index f3c86e3ac6465..a85b5446fd5df 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir
@@ -10,11 +10,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -46,11 +46,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -82,11 +82,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -115,7 +115,7 @@
   declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1 immarg)
   declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1 immarg)
   declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1 immarg)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -166,23 +166,23 @@ body:             |
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
   ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
   ; CHECK: bb.1.loop.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r12
-  ; CHECK:   $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg
   ; CHECK: bb.2.loop.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   $lr = tMOVr $r4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
   ; CHECK:   MVE_VPST 4, implicit $vpr
   ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2)
   ; CHECK:   renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2)
-  ; CHECK:   $lr = tMOVr $r4, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1
   ; CHECK:   $r0 = tMOVr $r1, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg
@@ -190,7 +190,7 @@ body:             |
   ; CHECK:   renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2)
   ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
   bb.0.entry:
     successors: %bb.1(0x80000000)
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
@@ -201,27 +201,27 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r4, -8
     tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
     t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
 
   bb.1.loop.ph:
     successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3
 
-    renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
-    $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
     liveins: $r0, $r1, $r2, $r3, $r4
 
+    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
     renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
     MVE_VPST 4, implicit $vpr
     renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2)
     renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2)
-    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
-    renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg
+    renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
     renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1
     renamable $lr = t2LoopDec killed renamable $lr, 1
     $r0 = tMOVr $r1, 14 /* CC::al */, $noreg
@@ -232,7 +232,7 @@ body:             |
     tB %bb.3, 14 /* CC::al */, $noreg
 
   bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
 
 ...
 ---
@@ -267,68 +267,69 @@ body:             |
   ; CHECK-LABEL: name: test_ctlz_i16
   ; CHECK: bb.0.entry:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4
-  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
   ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
-  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
   ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate
   ; CHECK: bb.1.loop.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r4
-  ; CHECK:   $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
   ; CHECK: bb.2.loop.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r12
   ; CHECK:   $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
   ; CHECK:   MVE_VPST 4, implicit $vpr
   ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
   ; CHECK:   renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
-  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1
   ; CHECK:   renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4)
   ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   liveins: $r4
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def dead $r7, def $pc
   bb.0.entry:
     successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3, $r7, $lr
 
-    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r4, -8
+    frame-setup CFI_INSTRUCTION offset $r7, -8
     tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
     t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
 
   bb.1.loop.ph:
     successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3
 
-    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
-    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
     liveins: $r0, $r1, $r2, $r3, $r12
 
-    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
     $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
     MVE_VPST 4, implicit $vpr
     renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
     renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
-    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
     renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1
     renamable $lr = t2LoopDec killed renamable $lr, 1
     renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg
@@ -338,7 +339,7 @@ body:             |
     tB %bb.3, 14 /* CC::al */, $noreg
 
   bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
 
 ...
 ---
@@ -373,68 +374,69 @@ body:             |
   ; CHECK-LABEL: name: test_ctlz_i32
   ; CHECK: bb.0.entry:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4
-  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
   ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
-  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
   ; CHECK:   tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK:   t2IT 11, 8, implicit-def $itstate
-  ; CHECK:   tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate
   ; CHECK: bb.1.loop.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-  ; CHECK:   dead $lr = t2DLS renamable $r4
-  ; CHECK:   $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4
+  ; CHECK:   renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
   ; CHECK: bb.2.loop.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r12
   ; CHECK:   $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
   ; CHECK:   MVE_VPST 4, implicit $vpr
   ; CHECK:   renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
   ; CHECK:   renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
-  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1
   ; CHECK:   renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4)
   ; CHECK:   dead $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.exit:
-  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  ; CHECK:   liveins: $r4
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def dead $r7, def $pc
   bb.0.entry:
     successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3, $r7, $lr
 
-    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r4, -8
+    frame-setup CFI_INSTRUCTION offset $r7, -8
     tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
     t2IT 11, 8, implicit-def $itstate
-    tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
 
   bb.1.loop.ph:
     successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+    liveins: $r0, $r1, $r2, $r3
 
-    renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
-    $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    renamable $lr = t2DoLoopStart killed renamable $lr
+    $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
     liveins: $r0, $r1, $r2, $r3, $r12
 
-    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
     $lr = tMOVr $r12, 14 /* CC::al */, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
     MVE_VPST 4, implicit $vpr
     renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4)
     renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4)
-    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg
     renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1
     renamable $lr = t2LoopDec killed renamable $lr, 1
     renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg
@@ -444,6 +446,6 @@ body:             |
     tB %bb.3, 14 /* CC::al */, $noreg
 
   bb.3.exit:
-    tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
 
 ...

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
index 550972e4a4f45..1e89d73b684e1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
@@ -19,7 +19,7 @@
     br i1 %tmp, label %bb27, label %bb3
 
   bb3:                                              ; preds = %bb
-    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6)
     %scevgep1 = getelementptr i32, i32* %arg3, i32 -4
     br label %bb9
 
@@ -27,7 +27,7 @@
     %lsr.iv4 = phi i32* [ %scevgep6, %bb9 ], [ %scevgep1, %bb3 ]
     %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
     %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
-    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ]
     %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
@@ -56,7 +56,7 @@
   }
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
@@ -197,7 +197,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0
     $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
index 8b69e1840ad21..c4d864248bd99 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
@@ -11,14 +11,14 @@
     %2 = sub i32 %0, %smin
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
     %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
     %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
     %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@@ -38,7 +38,7 @@
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
   declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -136,7 +136,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r2 = tLEApcrel %const.0, 14, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1.do.body (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
index de607549411e2..6fc0093224e76 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
@@ -19,14 +19,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -47,7 +47,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <4 x i1> @llvm.arm.vctp32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
@@ -162,7 +162,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
index d658f52a9e25b..8fee094ee4643 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir
@@ -24,14 +24,14 @@
     %5 = sub i32 %3, %smin36
     %6 = lshr i32 %5, 2
     %7 = add nuw nsw i32 %6, 1
-    call void @llvm.set.loop.iterations.i32(i32 %7)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %7)
     br label %do.body
   
   do.body:                                          ; preds = %do.body, %entry
     %count.0 = phi i32 [ %0, %entry ], [ %12, %do.body ]
     %pInT.0 = phi float* [ %pIn, %entry ], [ %add.ptr, %do.body ]
     %sumVec.0 = phi <4 x float> [ zeroinitializer, %entry ], [ %11, %do.body ]
-    %8 = phi i32 [ %7, %entry ], [ %13, %do.body ]
+    %8 = phi i32 [ %start1, %entry ], [ %13, %do.body ]
     %pInT.033 = bitcast float* %pInT.0 to <4 x float>*
     %9 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.0)
     %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pInT.033, i32 4, <4 x i1> %9, <4 x float> zeroinitializer)
@@ -125,7 +125,7 @@
     %50 = bitcast float* %arrayidx17 to <4 x float>*
     %51 = load <4 x float>, <4 x float>* %50, align 4
     %52 = fmul fast <4 x float> %51, %40
-    call void @llvm.set.loop.iterations.i32(i32 %33)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %33)
     br label %do.body24
   
   do.body24:                                        ; preds = %do.body24, %for.body
@@ -138,7 +138,7 @@
     %sumVec1.0 = phi <4 x float> [ %46, %for.body ], [ %58, %do.body24 ]
     %sumVec2.0 = phi <4 x float> [ %49, %for.body ], [ %60, %do.body24 ]
     %sumVec3.0 = phi <4 x float> [ %52, %for.body ], [ %62, %do.body24 ]
-    %53 = phi i32 [ %33, %for.body ], [ %63, %do.body24 ]
+    %53 = phi i32 [ %start2, %for.body ], [ %63, %do.body24 ]
     %lsr.iv4 = bitcast float* %lsr.iv to <4 x float>*
     %lsr.iv911 = bitcast float* %lsr.iv9 to <4 x float>*
     %lsr.iv1618 = bitcast float* %lsr.iv16 to <4 x float>*
@@ -219,7 +219,7 @@
     %k.1200 = phi i32 [ %inc, %do.end66 ], [ %k.0.lcssa, %for.body56.preheader ]
     %mul57 = mul i32 %k.1200, %0
     %arrayidx58 = getelementptr inbounds float, float* %2, i32 %mul57
-    call void @llvm.set.loop.iterations.i32(i32 %38)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %38)
     br label %do.body59
   
   do.body59:                                        ; preds = %do.body59, %for.body56
@@ -227,7 +227,7 @@
     %pInT.2 = phi float* [ %pIn, %for.body56 ], [ %add.ptr61, %do.body59 ]
     %pCos0.1 = phi float* [ %arrayidx58, %for.body56 ], [ %add.ptr62, %do.body59 ]
     %sumVec.1 = phi <4 x float> [ zeroinitializer, %for.body56 ], [ %93, %do.body59 ]
-    %89 = phi i32 [ %38, %for.body56 ], [ %95, %do.body59 ]
+    %89 = phi i32 [ %start3, %for.body56 ], [ %95, %do.body59 ]
     %pInT.21 = bitcast float* %pInT.2 to <4 x float>*
     %pCos0.12 = bitcast float* %pCos0.1 to <4 x float>*
     %90 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.2)
@@ -264,7 +264,7 @@
   declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
   declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #3
   declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1
-  declare void @llvm.set.loop.iterations.i32(i32) #4
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
 
 ...
@@ -414,7 +414,7 @@ body:             |
     $r0 = tMOVr $r4, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     $r1 = tMOVr $r5, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
   
   bb.1.do.body (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@@ -503,7 +503,7 @@ body:             |
     $r3 = tMOVr $r10, 14 /* CC::al */, $noreg
     $r5 = tMOVr $r1, 14 /* CC::al */, $noreg
     $r4 = tMOVr $r12, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r7 = tMOVr $r6, 14 /* CC::al */, $noreg
     renamable $r11 = t2LDRi12 $sp, 16, 14 /* CC::al */, $noreg :: (load 4 from %stack.5)
   
@@ -592,7 +592,7 @@ body:             |
     $r6 = tMOVr $r4, 14 /* CC::al */, $noreg
     $r7 = tMOVr $r5, 14 /* CC::al */, $noreg
     $lr = tMOVr $r3, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
 
   bb.13:
     successors: %bb.10(0x80000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
index 4595fa75d7125..5e4c960a6413a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir
@@ -7,7 +7,7 @@
 
   define void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
   entry:
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
     %scevgep = getelementptr i32, i32* %a, i32 -1
     %scevgep4 = getelementptr i32, i32* %c, i32 -1
     %scevgep8 = getelementptr i32, i32* %b, i32 -1
@@ -35,7 +35,7 @@
     %lsr.iv9 = phi i32* [ %scevgep8, %entry ], [ %scevgep10, %for.body ]
     %lsr.iv5 = phi i32* [ %scevgep4, %entry ], [ %scevgep6, %for.body ]
     %lsr.iv1 = phi i32* [ %scevgep, %entry ], [ %scevgep2, %for.body ]
-    %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+    %count = phi i32 [ %start, %entry ], [ %count.next, %for.body ]
     br label %for.body
   }
 
@@ -43,7 +43,7 @@
   declare i32 @llvm.arm.space(i32 immarg, i32) #0
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@@ -184,7 +184,7 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r7, -8
     $sp = frame-setup tSUBspi $sp, 8, 14, $noreg
     frame-setup CFI_INSTRUCTION def_cfa_offset 40
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
index 3e398eefb0924..9557f9a2acf45 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
@@ -8,21 +8,21 @@ define void @foo(%struct.SpeexPreprocessState_* nocapture readonly %st, i16* %x)
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrd r12, r4, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldrd r2, r3, [r0, #8]
 ; CHECK-NEXT:    rsb r12, r12, r4, lsl #1
+; CHECK-NEXT:    dlstp.16 lr, r12
 ; CHECK-NEXT:    mov r4, r12
-; CHECK-NEXT:    dlstp.16 lr, r4
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r2], #16
-; CHECK-NEXT:    vstrh.16 q0, [r3], #16
+; CHECK-NEXT:    vldrh.u16 q0, [r3], #16
+; CHECK-NEXT:    vstrh.16 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
-; CHECK-NEXT:    ldr r3, [r0]
+; CHECK-NEXT:    ldr r2, [r0]
 ; CHECK-NEXT:    ldr r0, [r0, #8]
 ; CHECK-NEXT:    vmov.i16 q0, #0x1800
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
-; CHECK-NEXT:    dlstp.16 lr, r3
+; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB0_3: @ %do.body6
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
index 2627965913ebc..33fee9833b021 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
@@ -9,8 +9,8 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #8
@@ -69,8 +69,8 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #8
@@ -129,8 +129,8 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4
@@ -189,8 +189,8 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r3, #4

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
index 712faa59fb7d5..29055d3489d18 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
@@ -17,11 +17,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@@ -52,7 +52,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -155,7 +155,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 7101cebbb9793..c45b4ff748a4c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -49,10 +49,10 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader.new
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
-; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    add.w r3, r12, r3, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r1, r3
@@ -228,9 +228,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -321,11 +321,12 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    sub.w r12, r2, #1
 ; CHECK-NEXT:    adr r2, .LCPI2_1
-; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    mov lr, r3
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vdup.32 q2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
index 352fad145dc4a..0f0c144b0f771 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
@@ -13,14 +13,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>*
     %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>*
@@ -41,7 +41,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
@@ -149,7 +149,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
index 993291931ffad..b7f7e7e1a8975 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
@@ -20,14 +20,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -48,7 +48,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -157,7 +157,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
index ec9d831795c65..724fec524feae 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
@@ -13,14 +13,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>*
     %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>*
@@ -41,7 +41,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <16 x i1> @llvm.arm.mve.vctp8(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
@@ -150,7 +150,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
index 70f668f197fa4..4d76707fe7ae6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
@@ -16,11 +16,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
     %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
     %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@@ -64,7 +64,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -201,7 +201,7 @@ body:             |
     renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r4 = tMOVr killed $lr, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
index 32404d165c3b1..49b958767a3b5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
@@ -16,11 +16,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
     %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
     %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@@ -64,7 +64,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -201,7 +201,7 @@ body:             |
     renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r4 = tMOVr killed $lr, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
index 9b186540773cf..d9ffdf475c067 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
@@ -16,11 +16,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
     %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
     %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
@@ -64,7 +64,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -201,7 +201,7 @@ body:             |
     renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r4 = tMOVr killed $lr, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
index ab3c866b015d7..7b9caf8bc1989 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
@@ -16,11 +16,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
     %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
@@ -65,7 +65,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -200,7 +200,7 @@ body:             |
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
     renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
     $r4 = tMOVr killed $r5, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
index b796712aa6ac0..2f9216eef6f5a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
@@ -18,11 +18,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
     %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
@@ -67,7 +67,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -201,7 +201,7 @@ body:             |
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
     renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
     $r4 = tMOVr killed $r5, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
index 2b354005535ed..76a3752a34a20 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
@@ -14,11 +14,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@@ -55,11 +55,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@@ -92,11 +92,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@@ -120,7 +120,7 @@
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -204,7 +204,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -321,7 +321,7 @@ body:             |
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -440,7 +440,7 @@ body:             |
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
index c2de31ddef1f2..e548a30cbc735 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
@@ -12,11 +12,11 @@
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
     store i32 %4, i32* %iter.addr, align 4
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
-    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
@@ -47,12 +47,12 @@
     %2 = sub i32 %0, %smin
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     store i32 %4, i32* %iter.addr, align 4
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
-    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
@@ -84,7 +84,7 @@
   declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #3
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #4
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
@@ -178,7 +178,7 @@ body:             |
     renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
     t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg
 
   bb.1.do.body:
@@ -247,8 +247,8 @@ body:             |
   ; CHECK:   renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK:   t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK:   $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg
   ; CHECK: bb.1.do.body:
   ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@@ -282,8 +282,8 @@ body:             |
     renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
     t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr)
+    $lr = t2DoLoopStart renamable $lr
     $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg
 
   bb.1.do.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
index 047701aaa75bd..a3f176e5af52e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir
@@ -13,14 +13,14 @@
     %2 = sub i32 %0, %smin
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
     %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
     %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
     %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@@ -40,7 +40,7 @@
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
   declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -149,7 +149,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r2 = tLEApcrel %const.0, 14, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1.do.body (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
index d5bc54820182a..dc2dc8a80b744 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir
@@ -14,14 +14,14 @@
     %2 = sub i32 %0, %smin
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
     %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
     %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
     %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@@ -41,7 +41,7 @@
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
   declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -140,7 +140,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r2 = tLEApcrel %const.0, 14, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1.do.body (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
index 2ee932acb840a..6e73fcbc5a5a2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir
@@ -78,6 +78,7 @@ body:             |
   ; CHECK:   successors: %bb.5(0x80000000)
   ; CHECK:   liveins: $q0, $r0, $r1, $r2, $r4
   ; CHECK:   renamable $s4 = nnan ninf nsz VADDS renamable $s0, renamable $s1, 14 /* CC::al */, $noreg
+  ; CHECK:   dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg
   ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $s4 = nnan ninf nsz VADDS renamable $s2, killed renamable $s4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14 /* CC::al */, $noreg, implicit killed $q0
@@ -151,7 +152,7 @@ body:             |
     renamable $r4 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
     $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.3:
     successors: %bb.3(0x7c000000), %bb.4(0x04000000)
@@ -178,7 +179,7 @@ body:             |
     renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14, $noreg, implicit $q0
     $s2 = VMOVSR $r1, 14, $noreg
     renamable $s2 = VUITOS killed renamable $s2, 14, $noreg
-    t2DoLoopStart killed $r4
+    $lr = t2DoLoopStart killed $r4
     renamable $s4 = nnan ninf nsz VDIVS killed renamable $s0, killed renamable $s2, 14, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
index d8f40301d9de3..285b2bad8c061 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir
@@ -15,14 +15,14 @@
     %2 = sub i32 %0, %smin
     %3 = lshr i32 %2, 2
     %4 = add nuw nsw i32 %3, 1
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %entry
     %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
     %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %5 = phi i32 [ %start, %entry ], [ %9, %do.body ]
     %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
     %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
     %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
@@ -42,7 +42,7 @@
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
   declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -149,7 +149,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r2 = tLEApcrel %const.0, 14, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1.do.body (align 4):
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
index b71604c0be448..f4b64f4f56ec4 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
@@ -20,11 +20,11 @@
     %trip.count.minus.1 = add i32 %N, -1
     %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
     %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
@@ -56,7 +56,7 @@
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -168,7 +168,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg
     renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
index a04d335c6b2a3..0c1f7e41f1b4c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
@@ -18,11 +18,11 @@
     %trip.count.minus.1 = add i32 %N, -1
     %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
     %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
@@ -54,7 +54,7 @@
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 ...
 ---
@@ -165,7 +165,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg
     renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
index a1a1e785672db..084a0c925f97f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
@@ -19,7 +19,7 @@
     %trip.count.minus.1 = add i32 %N, -1
     %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
     %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -27,7 +27,7 @@
     %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
     %elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %12, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %12, %vector.body ]
     %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
     %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
     %7 = insertelement <4 x i32> undef, i32 %div, i32 0
@@ -52,7 +52,7 @@
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -147,7 +147,7 @@ body:             |
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
     renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg
     renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
index 3844cac0f906f..d86195c98b116 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
@@ -14,11 +14,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
@@ -50,7 +50,7 @@
 
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -136,7 +136,7 @@ body:             |
     renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
     $r12 = tMOVr killed $r4, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
index 6a607e28cdb7a..fc914f7920f1f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
@@ -18,7 +18,7 @@
     br i1 %tmp7, label %bb13, label %bb12
 
   bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
     br label %bb28
 
   bb13:                                             ; preds = %bb28, %bb4
@@ -45,7 +45,7 @@
     ret void
 
   bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
     %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
     %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
     %0 = bitcast i32* %arg1 to i8*
@@ -145,7 +145,7 @@
     br label %bb27
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -387,7 +387,7 @@ body:             |
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
     renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r12 = tMOVr killed $r3, 14, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
index f2cb5547c7dd6..7662bb5ae037a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir
@@ -18,7 +18,7 @@
     br i1 %tmp7, label %bb13, label %bb12
 
   bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
     br label %bb28
 
   bb13:                                             ; preds = %bb28, %bb4
@@ -46,7 +46,7 @@
     ret i32 %res
 
   bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
     %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
     %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
     %0 = bitcast i32* %arg1 to i8*
@@ -146,7 +146,7 @@
     br label %bb27
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -265,7 +265,8 @@ body:             |
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $r3
+  ; CHECK:   dead $lr = t2DLS renamable $r3
+  ; CHECK:   $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
   ; CHECK: bb.5.bb28:
   ; CHECK:   successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@@ -403,7 +404,7 @@ body:             |
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
     renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $lr = tMOVr killed $r3, 14, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
index d391e8211eb3c..06015ba2d69c3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
@@ -18,7 +18,7 @@
     br i1 %tmp7, label %bb13, label %bb12
 
   bb12:                                             ; preds = %bb4
-    call void @llvm.set.loop.iterations.i32(i32 %tmp11)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11)
     br label %bb28
 
   bb13:                                             ; preds = %bb28, %bb4
@@ -46,7 +46,7 @@
     ret i32 %res
 
   bb28:                                             ; preds = %bb28, %bb12
-    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ]
+    %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
     %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
     %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
     %0 = bitcast i32* %arg1 to i8*
@@ -146,7 +146,7 @@
     br label %bb27
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -265,7 +265,8 @@ body:             |
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $r3
+  ; CHECK:   dead $lr = t2DLS renamable $r3
+  ; CHECK:   $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
   ; CHECK: bb.5.bb28:
   ; CHECK:   successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@@ -403,7 +404,7 @@ body:             |
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
     renamable $r8 = t2MOVi 0, 14, $noreg, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $lr = tMOVr $r3, 14, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
index 62da471339454..9991bea38e969 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
 ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL
 
@@ -16,10 +15,10 @@
 ; CHECK: ne_and_guard
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@@ -49,10 +48,10 @@ if.end:                                           ; preds = %while.body, %entry
 ; CHECK: ne_preheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@@ -84,10 +83,10 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: eq_preheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tCMPi8 renamable $r0, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r0
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@@ -119,10 +118,10 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: ne_prepreheader
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   t2CMPri renamable $r12, 0
 ; CHECK:   tBcc %bb.4
 ; CHECK: bb.2.while.body.preheader:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r12
 ; CHECK: bb.3.while.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.3
 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
@@ -153,7 +152,7 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: be_ne
 ; CHECK: body:
 ; CHECK: bb.0.entry:
-; CHECK:   $lr = t2DLS killed renamable $lr
+; CHECK:   $lr = t2DLS killed renamable $r12
 ; CHECK: bb.2.do.body:
 ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
index c4a372d790e48..3191349762a08 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
@@ -15,14 +15,14 @@
 
   vector.ph:                                        ; preds = %entry
     %6 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %init, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv13 = phi float* [ %scevgep14, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ]
-    %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
     %8 = phi i32 [ %N, %vector.ph ], [ %10, %vector.body ]
     %lsr.iv12 = bitcast float* %lsr.iv to <4 x float>*
     %lsr.iv1315 = bitcast float* %lsr.iv13 to <4 x float>*
@@ -63,14 +63,14 @@
 
   vector.ph:                                        ; preds = %entry
     %6 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %init, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv14 = phi float* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ]
-    %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
     %8 = phi i32 [ %shr, %vector.ph ], [ %10, %vector.body ]
     %lsr.iv13 = bitcast float* %lsr.iv to <4 x float>*
     %lsr.iv1416 = bitcast float* %lsr.iv14 to <4 x float>*
@@ -99,7 +99,7 @@
   declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
   declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -205,7 +205,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
     $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1
 
@@ -341,7 +341,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
     renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg
     $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
index d7a6d331b5358..cdaeeced89d8e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
@@ -16,7 +16,7 @@
     %scevgep = getelementptr i32, i32* %a, i32 -1
     %scevgep4 = getelementptr i32, i32* %c, i32 -1
     %scevgep8 = getelementptr i32, i32* %b, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
     br label %for.body
 
   for.cond.cleanup:                                 ; preds = %for.body, %entry
@@ -26,7 +26,7 @@
     %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
     %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
     %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ]
     %size = call i32 @llvm.arm.space(i32 4096, i32 undef)
     %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1
     %1 = load i32, i32* %scevgep3, align 4
@@ -47,7 +47,7 @@
   declare i32 @llvm.arm.space(i32 immarg, i32) #0
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@@ -157,7 +157,7 @@ body:             |
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
     renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
     $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3
 
   bb.2.for.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
index e3eb367f68de2..843ec0089c122 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir
@@ -26,14 +26,14 @@
     call void @llvm.dbg.value(metadata i32 0, metadata !31, metadata !DIExpression()), !dbg !32
     %arrayidx7.us = getelementptr inbounds i32, i32* %e, i32 %i.031.us, !dbg !38
     %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4, !dbg !41
-    call void @llvm.set.loop.iterations.i32(i32 %d), !dbg !46
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %d), !dbg !46
     br label %for.body3.us, !dbg !46
 
   for.body3.us:                                     ; preds = %for.body3.us, %for.cond1.preheader.us
     %lsr.iv5 = phi i16* [ %scevgep6, %for.body3.us ], [ %lsr.iv2, %for.cond1.preheader.us ], !dbg !32
     %lsr.iv1 = phi i16* [ %scevgep, %for.body3.us ], [ %l, %for.cond1.preheader.us ], !dbg !32
     %add829.us = phi i32 [ %arrayidx7.promoted.us, %for.cond1.preheader.us ], [ %add8.us, %for.body3.us ], !dbg !32
-    %1 = phi i32 [ %d, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32
+    %1 = phi i32 [ %start, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32
     call void @llvm.dbg.value(metadata i32 undef, metadata !31, metadata !DIExpression()), !dbg !32
     %2 = load i16, i16* %lsr.iv5, align 2, !dbg !47
     %conv.us = sext i16 %2 to i32, !dbg !47
@@ -67,7 +67,7 @@
   }
   declare !dbg !4 dso_local arm_aapcscc signext i16 @get_input(i32, i32*, i16 signext)
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
   !llvm.dbg.cu = !{!0}
@@ -325,7 +325,7 @@ body:             |
     $r3 = tMOVr $r5, 14, $noreg, debug-location !32
     $r0 = tMOVr $r8, 14, $noreg, debug-location !32
     $lr = tMOVr $r10, 14, $noreg, debug-location !32
-    t2DoLoopStart renamable $r10, debug-location !46
+    $lr = t2DoLoopStart renamable $r10, debug-location !46
 
   bb.3.for.body3.us:
     successors: %bb.3(0x7c000000), %bb.4(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
index f9d1abbee9e54..0acad61f8be3f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
@@ -10,7 +10,7 @@
     br i1 %cmp19.i, label %for.body.i.preheader, label %c.exit.thread
 
   for.body.i.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %d)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %d)
     br label %for.body.i
 
   c.exit.thread:                                    ; preds = %entry
@@ -22,7 +22,7 @@
     %lsr.iv15 = phi i32* [ %e, %for.body.i.preheader ], [ %scevgep16, %for.body.i ]
     %h.022.i = phi i16 [ %h.1.i, %for.body.i ], [ 0, %for.body.i.preheader ]
     %f.020.i = phi i32 [ %f.1.i, %for.body.i ], [ undef, %for.body.i.preheader ]
-    %0 = phi i32 [ %d, %for.body.i.preheader ], [ %2, %for.body.i ]
+    %0 = phi i32 [ %start1, %for.body.i.preheader ], [ %2, %for.body.i ]
     %1 = load i32, i32* %lsr.iv15, align 4
     %add.i = add nsw i32 %1, %f.020.i
     %cmp1.i = icmp sgt i32 %add.i, 0
@@ -60,14 +60,14 @@
     %arrayidx12.us = getelementptr inbounds i32, i32* %e, i32 %i.064.us
     %arrayidx12.promoted.us = load i32, i32* %arrayidx12.us, align 4
     %11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx12.promoted.us, i32 0
-    call void @llvm.set.loop.iterations.i32(i32 %8)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %for.cond4.preheader.us
     %lsr.iv10 = phi i16* [ %scevgep11, %vector.body ], [ %lsr.iv7, %for.cond4.preheader.us ]
     %lsr.iv4 = phi i16* [ %scevgep5, %vector.body ], [ %l, %for.cond4.preheader.us ]
     %vec.phi = phi <4 x i32> [ %11, %for.cond4.preheader.us ], [ %19, %vector.body ]
-    %12 = phi i32 [ %8, %for.cond4.preheader.us ], [ %20, %vector.body ]
+    %12 = phi i32 [ %start2, %for.cond4.preheader.us ], [ %20, %vector.body ]
     %13 = phi i32 [ %d, %for.cond4.preheader.us ], [ %15, %vector.body ]
     %lsr.iv1012 = bitcast i16* %lsr.iv10 to <4 x i16>*
     %lsr.iv46 = bitcast i16* %lsr.iv4 to <4 x i16>*
@@ -108,14 +108,14 @@
     br i1 %29, label %for.body.i57.preheader, label %c.exit59
 
   for.body.i57.preheader:                           ; preds = %for.end16
-    call void @llvm.set.loop.iterations.i32(i32 %d)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %d)
     br label %for.body.i57
 
   for.body.i57:                                     ; preds = %for.body.i57, %for.body.i57.preheader
     %lsr.iv1 = phi i32* [ %e, %for.body.i57.preheader ], [ %scevgep, %for.body.i57 ]
     %h.022.i44 = phi i16 [ %h.1.i54, %for.body.i57 ], [ 0, %for.body.i57.preheader ]
     %f.020.i46 = phi i32 [ %f.1.i51, %for.body.i57 ], [ undef, %for.body.i57.preheader ]
-    %30 = phi i32 [ %d, %for.body.i57.preheader ], [ %32, %for.body.i57 ]
+    %30 = phi i32 [ %start3, %for.body.i57.preheader ], [ %32, %for.body.i57 ]
     %31 = load i32, i32* %lsr.iv1, align 4
     %add.i48 = add nsw i32 %31, %f.020.i46
     %cmp1.i49 = icmp sgt i32 %add.i48, 0
@@ -142,7 +142,7 @@
   declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -385,7 +385,7 @@ body:             |
     renamable $r2 = IMPLICIT_DEF
     $r10 = tMOVr $r0, 14, $noreg
     $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed renamable $r0
+    $lr = t2DoLoopStart killed renamable $r0
 
   bb.2.for.body.i:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -443,7 +443,7 @@ body:             |
     $r6 = tMOVr $r5, 14, $noreg
     $r1 = tMOVr $r8, 14, $noreg
     $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $r0
+    $lr = t2DoLoopStart renamable $r0
 
   bb.6.vector.body:
     successors: %bb.6(0x7c000000), %bb.7(0x04000000)
@@ -488,7 +488,7 @@ body:             |
 
     renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
     renamable $r1 = IMPLICIT_DEF
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.10.for.body.i57:
     successors: %bb.10(0x7c000000), %bb.11(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
index 3409156b79e7d..bdd81cbf12d7e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir
@@ -9,13 +9,13 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %while.body
 
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %0 = phi i32 [ %n, %entry ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %entry ], [ %2, %while.body ]
     %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
     %1 = load i32, i32* %scevgep6, align 4
@@ -30,7 +30,7 @@
     ret i32 0
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
 
   attributes #0 = { noduplicate nounwind }
@@ -91,7 +91,8 @@ body:             |
   ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
-  ; CHECK:   $lr = t2DLS killed $r0
+  ; CHECK:   dead $lr = t2DLS $r0
+  ; CHECK:   $lr = tMOVr killed $r0, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg
   ; CHECK: bb.1.while.body:
@@ -111,7 +112,7 @@ body:             |
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
     $lr = tMOVr killed $r0, 14, $noreg
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
index 94e3e26c819d6..22e53267b619c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s
 
+# TODO: As far as I can tell this test is fine. Tail-predicating the second loop removes the instruction that would otherwise block the first.
+
 --- |
   define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 {
   entry:
@@ -15,14 +17,14 @@
     %6 = sub i32 %0, %smin3
     %7 = lshr i32 %6, 2
     %8 = add nuw nsw i32 %7, 1
-    call void @llvm.set.loop.iterations.i32(i32 %8)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
     br label %do.body.i
 
   do.body.i:                                        ; preds = %do.body.i, %entry
     %blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ]
     %sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ]
     %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
-    %9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ]
+    %9 = phi i32 [ %start1, %entry ], [ %14, %do.body.i ]
     %pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>*
     %10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
     %11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer)
@@ -42,14 +44,14 @@
     %18 = insertelement <4 x i32> undef, i32 %17, i64 0
     %19 = shufflevector <4 x i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer
     %20 = bitcast <4 x i32> %19 to <4 x float>
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %do.body
 
   do.body:                                          ; preds = %do.body, %arm_mean_f32_mve.exit
     %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ]
     %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ]
     %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
-    %21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
+    %21 = phi i32 [ %start2, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
     %pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>*
     %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
     %23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> zeroinitializer)
@@ -87,7 +89,7 @@
   declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3
@@ -152,32 +154,22 @@ body:             |
   ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
-  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
-  ; CHECK:   tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
-  ; CHECK:   t2IT 10, 8, implicit-def $itstate
-  ; CHECK:   renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate
-  ; CHECK:   renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
   ; CHECK:   $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r3
   ; CHECK:   $r4 = tMOVr $lr, 14 /* CC::al */, $noreg
   ; CHECK: bb.1.do.body.i:
   ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
-  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12
+  ; CHECK:   renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
+  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
   ; CHECK: bb.2.arm_mean_f32_mve.exit:
   ; CHECK:   successors: %bb.3(0x80000000)
   ; CHECK:   liveins: $q0, $r0, $r1, $r2, $r4
   ; CHECK:   $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
+  ; CHECK:   dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
   ; CHECK:   $lr = t2DLS killed $r4
   ; CHECK:   renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
@@ -224,7 +216,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
     $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
     $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r4 = tMOVr $lr, 14 /* CC::al */, $noreg
 
   bb.1.do.body.i:
@@ -247,7 +239,7 @@ body:             |
     $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
     $lr = tMOVr $r4, 14 /* CC::al */, $noreg
     renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
-    t2DoLoopStart killed $r4
+    $lr = t2DoLoopStart killed $r4
     renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
     renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
     renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
index 51c7f34262838..8d247fdad805e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
@@ -14,13 +14,13 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
     %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
     %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@@ -46,7 +46,7 @@
   }
   declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 ...
@@ -153,7 +153,7 @@ body:             |
     renamable $r5 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $r5
+    $lr = t2DoLoopStart renamable $r5
     $lr = tMOVr killed $r5, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
index 1404075dce901..4aefd5a2ecb28 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
@@ -6,35 +6,31 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    cmp r1, #4
 ; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, #4
-; CHECK-NEXT:    mov.w r12, #1
-; CHECK-NEXT:    subs r3, r1, r3
+; CHECK-NEXT:    movge r4, #4
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    subs r4, r1, r4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    adds r3, #3
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
+; CHECK-NEXT:    adds r4, #3
+; CHECK-NEXT:    add.w r12, r3, r4, lsr #2
 ; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mov r4, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:  .LBB0_1: @ %do.body.i
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r12], #16
-; CHECK-NEXT:    vaddt.f32 q0, q0, q1
-; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %arm_mean_f32_mve.exit
 ; CHECK-NEXT:    vmov s4, r1
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vadd.f32 s0, s3, s3
 ; CHECK-NEXT:    mov r3, r1
 ; CHECK-NEXT:    vcvt.f32.u32 s4, s4
-; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    vdiv.f32 s0, s0, s4
-; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:  .LBB0_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -42,7 +38,7 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float
 ; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
-; CHECK-NEXT:    vsubt.f32 q1, q1, r12
+; CHECK-NEXT:    vsubt.f32 q1, q1, r4
 ; CHECK-NEXT:    vfmat.f32 q0, q1, q1
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %do.end

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
index ea3589f48fdb7..ff4252d28f05c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
@@ -18,13 +18,13 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
     %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
     %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@@ -50,7 +50,7 @@
   }
   declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -169,7 +169,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
     $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
     renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
index 0295acb67962d..6e6ce97671a08 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
@@ -18,13 +18,13 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
     %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
     %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@@ -50,7 +50,7 @@
   }
   declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -168,7 +168,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
     renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
index 9cedc94ab98ae..49383ba618fe6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir
@@ -36,17 +36,17 @@
     br i1 %26, label %49, label %31
 
   31:                                               ; preds = %23
-    call void @llvm.set.loop.iterations.i32(i32 %30)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %30)
     br label %65
 
   32:                                               ; preds = %11
-    call void @llvm.set.loop.iterations.i32(i32 %22)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %22)
     br label %33
 
   33:                                               ; preds = %33, %32
     %34 = phi i32* [ %46, %33 ], [ %0, %32 ]
     %35 = phi i32* [ %45, %33 ], [ %1, %32 ]
-    %36 = phi i32 [ %22, %32 ], [ %47, %33 ]
+    %36 = phi i32 [ %start2, %32 ], [ %47, %33 ]
     %37 = phi i32 [ %9, %32 ], [ %41, %33 ]
     %38 = bitcast i32* %34 to <4 x i32>*
     %39 = bitcast i32* %35 to <4 x i32>*
@@ -89,7 +89,7 @@
   65:                                               ; preds = %65, %31
     %66 = phi i32 [ %108, %65 ], [ 0, %31 ]
     %67 = phi i32 [ 0, %31 ], [ %107, %65 ]
-    %68 = phi i32 [ %30, %31 ], [ %109, %65 ]
+    %68 = phi i32 [ %start1, %31 ], [ %109, %65 ]
     %69 = bitcast i32* %0 to i8*
     %70 = bitcast i32* %1 to i8*
     %71 = getelementptr i8, i8* %70, i32 %66
@@ -141,7 +141,7 @@
 
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -353,7 +353,7 @@ body:             |
     renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
     $r2 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.3 (%ir-block.33):
     successors: %bb.3(0x7c000000), %bb.4(0x04000000)
@@ -402,7 +402,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14, $noreg, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.8 (%ir-block.65):
     successors: %bb.8(0x7c000000), %bb.9(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
index 588b62a22db8d..056aae9831ba6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir
@@ -18,13 +18,13 @@
     br i1 %10, label %34, label %17
 
   17:                                               ; preds = %4
-    call void @llvm.set.loop.iterations.i32(i32 %16)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %16)
     br label %18
 
   18:                                               ; preds = %18, %17
     %19 = phi i32* [ %31, %18 ], [ %0, %17 ]
     %20 = phi i32* [ %30, %18 ], [ %1, %17 ]
-    %21 = phi i32 [ %16, %17 ], [ %32, %18 ]
+    %21 = phi i32 [ %start, %17 ], [ %32, %18 ]
     %22 = phi i32 [ %9, %17 ], [ %26, %18 ]
     %23 = bitcast i32* %19 to <4 x i32>*
     %24 = bitcast i32* %20 to <4 x i32>*
@@ -45,7 +45,7 @@
   }
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -143,7 +143,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
     $r3 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2 (%ir-block.18):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
index e23f0bc6f4b82..075df0200eee8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
@@ -8,7 +8,7 @@
     br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
 
   for.body.preheader:                               ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
     br label %for.body
 
   for.cond.cleanup:                                 ; preds = %for.end, %entry
@@ -18,7 +18,7 @@
     %lsr.iv4 = phi i32* [ %b, %for.body.preheader ], [ %scevgep5, %for.end ]
     %lsr.iv2 = phi i32* [ %c, %for.body.preheader ], [ %scevgep3, %for.end ]
     %lsr.iv1 = phi i32* [ %a, %for.body.preheader ], [ %scevgep, %for.end ]
-    %lsr.iv = phi i32 [ %N, %for.body.preheader ], [ %lsr.iv.next, %for.end ]
+    %lsr.iv = phi i32 [ %start, %for.body.preheader ], [ %lsr.iv.next, %for.end ]
     %size = call i32 @llvm.arm.space(i32 3072, i32 undef)
     %0 = load i32, i32* %lsr.iv4, align 4
     %1 = load i32, i32* %lsr.iv2, align 4
@@ -46,7 +46,7 @@
   declare i32 @llvm.arm.space(i32 immarg, i32) #0
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@@ -166,7 +166,7 @@ body:             |
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
 
     $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3
     tB %bb.2, 14, $noreg
 
   bb.2.for.end:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
index 087db2ae509d0..17731f72200c3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir
@@ -14,14 +14,14 @@
     br i1 %cmp30, label %for.cond.cleanup6, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ]
     %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>*
     %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>*
@@ -50,14 +50,14 @@
     br i1 %13, label %for.cond.cleanup6, label %vector.ph39
 
   vector.ph39:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %19)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %19)
     br label %vector.body38
 
   vector.body38:                                    ; preds = %vector.body38, %vector.ph39
     %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ]
     %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ]
-    %20 = phi i32 [ %19, %vector.ph39 ], [ %26, %vector.body38 ]
+    %20 = phi i32 [ %start2, %vector.ph39 ], [ %26, %vector.body38 ]
     %21 = phi i32 [ %N, %vector.ph39 ], [ %23, %vector.body38 ]
     %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
     %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
@@ -94,14 +94,14 @@
     br i1 %cmp30, label %for.cond4.preheader, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ]
     %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>*
     %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>*
@@ -130,14 +130,14 @@
     br i1 %cmp528, label %for.cond.cleanup6, label %vector.ph39
 
   vector.ph39:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %18)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18)
     br label %vector.body38
 
   vector.body38:                                    ; preds = %vector.body38, %vector.ph39
     %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ]
     %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ]
-    %19 = phi i32 [ %18, %vector.ph39 ], [ %25, %vector.body38 ]
+    %19 = phi i32 [ %start2, %vector.ph39 ], [ %25, %vector.body38 ]
     %20 = phi i32 [ %N, %vector.ph39 ], [ %22, %vector.body38 ]
     %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
     %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
@@ -173,14 +173,14 @@
     br i1 %cmp54, label %for.cond.cleanup17, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv123 = phi i32* [ %scevgep124, %vector.body ], [ %a, %vector.ph ]
     %lsr.iv120 = phi i32* [ %scevgep121, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv117 = phi i32* [ %scevgep118, %vector.body ], [ %b, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv123125 = bitcast i32* %lsr.iv123 to <4 x i32>*
     %lsr.iv120122 = bitcast i32* %lsr.iv120 to <4 x i32>*
@@ -210,14 +210,14 @@
     br i1 %cmp552, label %for.cond15.preheader, label %vector.ph66
 
   vector.ph66:                                      ; preds = %for.cond4.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %18)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18)
     br label %vector.body65
 
   vector.body65:                                    ; preds = %vector.body65, %vector.ph66
     %lsr.iv114 = phi i32* [ %scevgep115, %vector.body65 ], [ %a, %vector.ph66 ]
     %lsr.iv111 = phi i32* [ %scevgep112, %vector.body65 ], [ %c, %vector.ph66 ]
     %lsr.iv108 = phi i32* [ %scevgep109, %vector.body65 ], [ %b, %vector.ph66 ]
-    %19 = phi i32 [ %18, %vector.ph66 ], [ %25, %vector.body65 ]
+    %19 = phi i32 [ %start2, %vector.ph66 ], [ %25, %vector.body65 ]
     %20 = phi i32 [ %div, %vector.ph66 ], [ %22, %vector.body65 ]
     %lsr.iv114116 = bitcast i32* %lsr.iv114 to <4 x i32>*
     %lsr.iv111113 = bitcast i32* %lsr.iv111 to <4 x i32>*
@@ -248,14 +248,14 @@
     br i1 %27, label %for.cond.cleanup17, label %vector.ph85
 
   vector.ph85:                                      ; preds = %for.cond15.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %33)
+    %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %33)
     br label %vector.body84
 
   vector.body84:                                    ; preds = %vector.body84, %vector.ph85
     %lsr.iv105 = phi i32* [ %scevgep106, %vector.body84 ], [ %a, %vector.ph85 ]
     %lsr.iv102 = phi i32* [ %scevgep103, %vector.body84 ], [ %c, %vector.ph85 ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body84 ], [ %b, %vector.ph85 ]
-    %34 = phi i32 [ %33, %vector.ph85 ], [ %40, %vector.body84 ]
+    %34 = phi i32 [ %start3, %vector.ph85 ], [ %40, %vector.body84 ]
     %35 = phi i32 [ %N, %vector.ph85 ], [ %37, %vector.body84 ]
     %lsr.iv105107 = bitcast i32* %lsr.iv105 to <4 x i32>*
     %lsr.iv102104 = bitcast i32* %lsr.iv102 to <4 x i32>*
@@ -280,7 +280,7 @@
   }
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -431,7 +431,7 @@ body:             |
     $r4 = tMOVr $r3, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg
     $r6 = tMOVr $r1, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -462,7 +462,7 @@ body:             |
     renamable $r6, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r6, killed renamable $r12, 19, 14, $noreg, $noreg
     $r12 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.5.vector.body38:
     successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@@ -637,7 +637,7 @@ body:             |
     renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg
     $r6 = tMOVr $r2, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -670,7 +670,7 @@ body:             |
     renamable $r6 = t2BICri killed renamable $r6, 3, 14, $noreg, $noreg
     renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.5.vector.body38:
     successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@@ -878,7 +878,7 @@ body:             |
     $r4 = tMOVr $r3, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg
     $r6 = tMOVr $r1, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -919,7 +919,7 @@ body:             |
     $r4 = tMOVr $r1, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs renamable $r8, killed renamable $r6, 19, 14, $noreg, $noreg
     $r6 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.5.vector.body65:
     successors: %bb.5(0x7c000000), %bb.6(0x04000000)
@@ -952,7 +952,7 @@ body:             |
 
     renamable $lr = nuw nsw t2ADDrs killed renamable $r8, killed renamable $r12, 19, 14, $noreg, $noreg
     $r5 = tMOVr $r0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.8.vector.body84:
     successors: %bb.8(0x7c000000), %bb.9(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 261222f60f17a..91cea20dfbeb1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -92,9 +92,9 @@ define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB0_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@@ -311,9 +311,9 @@ define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB1_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@@ -530,9 +530,9 @@ define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB2_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@@ -680,9 +680,9 @@ define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* no
 ; CHECK-NEXT:    sub.w r7, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
@@ -889,10 +889,10 @@ define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32*
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #16
@@ -906,11 +906,11 @@ define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r0], #4
@@ -994,10 +994,10 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@@ -1021,11 +1021,11 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@@ -1111,10 +1111,10 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@@ -1138,11 +1138,11 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  .LBB6_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB6_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@@ -1228,10 +1228,10 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr.w r9, [r4]
@@ -1255,11 +1255,11 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB7_8
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader11
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r1]
@@ -1345,10 +1345,10 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q0, [r5], #8
@@ -1377,11 +1377,11 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    cmp r12, r3
 ; CHECK-NEXT:    beq .LBB8_8
 ; CHECK-NEXT:  .LBB8_6: @ %for.body.preheader13
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB8_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r3, [r1], #2
@@ -1476,9 +1476,9 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI9_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB9_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r0, r3
@@ -1633,9 +1633,9 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI10_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB10_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, r0, r3
@@ -1790,10 +1790,10 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldr s0, .LCPI11_0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    adds r3, r1, #4
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adds r2, r0, #4
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB11_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh.w r4, [r3, #2]

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 5c3af352782b0..96c89fb6545ec 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -15,9 +15,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -91,9 +91,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -167,9 +167,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -243,9 +243,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -319,9 +319,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
@@ -430,10 +430,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    adds r5, r0, #3
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    adds r6, r1, #1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r8, [r5, #-3]
@@ -624,8 +624,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -732,10 +732,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    adds r5, r0, #3
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    adds r6, r1, #1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r8, [r5, #-3]
@@ -926,8 +926,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -1034,10 +1034,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
 ; CHECK-NEXT:    add.w r4, r3, #8
 ; CHECK-NEXT:    subs r5, #4
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    add.w lr, r6, r5, lsr #2
+; CHECK-NEXT:    add.w r6, r6, r5, lsr #2
 ; CHECK-NEXT:    add.w r5, r0, #8
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    add.w r6, r1, #8
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r8, [r5, #-8]
@@ -1214,8 +1214,8 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #8

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index c797e0401d4f2..c8d38032a6a4a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -12,47 +12,47 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[N_VEC]], -4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
+; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
 ; CHECK-NEXT:    [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[TT3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]]
 ; CHECK-NEXT:    [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TT4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TT8:%.*]] = bitcast i16* [[TT6]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT:    [[TT9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TT10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT11:%.*]] = bitcast i16* [[TT10]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT:    [[TT12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32>
+; CHECK-NEXT:    [[TT13:%.*]] = mul nsw <4 x i32> [[TT12]], [[TT9]]
+; CHECK-NEXT:    [[TT14]] = add nsw <4 x i32> [[TT13]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1)
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK-NEXT:    [[TT15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1)
+; CHECK-NEXT:    [[TT16:%.*]] = icmp ne i32 [[TT15]], 0
+; CHECK-NEXT:    br i1 [[TT16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
-; CHECK-NEXT:    store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT:    [[TT17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT14]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TT18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TT17]])
+; CHECK-NEXT:    store i32 [[TT18]], i32* [[ARRAYIDX8_US]], align 4
 ; CHECK-NEXT:    [[INC10_US]] = add nuw i32 [[I_025_US]], 1
 ; CHECK-NEXT:    [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
@@ -69,51 +69,51 @@ for.cond1.preheader.us.preheader:                 ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp = add i32 %n.vec, -4
-  %tmp1 = lshr i32 %tmp, 2
-  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tt = add i32 %n.vec, -4
+  %tt1 = lshr i32 %tt, 2
+  %tt2 = add nuw nsw i32 %tt1, 1
   br label %for.cond1.preheader.us
 
 for.cond1.preheader.us:                           ; preds = %middle.block, %for.cond1.preheader.us.preheader
   %i.025.us = phi i32 [ %inc10.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
   %arrayidx.us = getelementptr inbounds i16*, i16** %A, i32 %i.025.us
-  %tmp3 = load i16*, i16** %arrayidx.us, align 4
+  %tt3 = load i16*, i16** %arrayidx.us, align 4
   %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.025.us
   %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
-  %tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx8.promoted.us, i32 0
-  call void @llvm.set.loop.iterations.i32(i32 %tmp2)
+  %tt4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx8.promoted.us, i32 0
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %for.cond1.preheader.us
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
-  %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp14, %vector.body ]
-  %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp15, %vector.body ]
+  %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
+  %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index
+  %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index
 
-  ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29
-  %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
+  %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
 
-  %tmp8 = bitcast i16* %tmp6 to <4 x i16>*
-  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
-  %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32>
-  %tmp10 = getelementptr inbounds i16, i16* %B, i32 %index
-  %tmp11 = bitcast i16* %tmp10 to <4 x i16>*
-  %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp11, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
-  %tmp12 = sext <4 x i16> %wide.masked.load30 to <4 x i32>
-  %tmp13 = mul nsw <4 x i32> %tmp12, %tmp9
-  %tmp14 = add nsw <4 x i32> %tmp13, %vec.phi
+  %tt8 = bitcast i16* %tt6 to <4 x i16>*
+  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
+  %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
+  %tt10 = getelementptr inbounds i16, i16* %B, i32 %index
+  %tt11 = bitcast i16* %tt10 to <4 x i16>*
+  %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt11, i32 2, <4 x i1> %tt7, <4 x i16> undef)
+  %tt12 = sext <4 x i16> %wide.masked.load30 to <4 x i32>
+  %tt13 = mul nsw <4 x i32> %tt12, %tt9
+  %tt14 = add nsw <4 x i32> %tt13, %vec.phi
   %index.next = add i32 %index, 4
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1)
-  %tmp16 = icmp ne i32 %tmp15, 0
-  br i1 %tmp16, label %vector.body, label %middle.block
+  %tt15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1)
+  %tt16 = icmp ne i32 %tt15, 0
+  br i1 %tt16, label %vector.body, label %middle.block
 
 middle.block:                                     ; preds = %vector.body
-  %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi
-  %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17)
-  store i32 %tmp18, i32* %arrayidx8.us, align 4
+  %tt17 = select <4 x i1> %tt7, <4 x i32> %tt14, <4 x i32> %vec.phi
+  %tt18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt17)
+  store i32 %tt18, i32* %arrayidx8.us, align 4
   %inc10.us = add nuw i32 %i.025.us, 1
   %exitcond27 = icmp eq i32 %inc10.us, %N
   br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
@@ -133,45 +133,45 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[N_VEC]], -4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
+; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
 ; CHECK-NEXT:    [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[TT3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]]
 ; CHECK-NEXT:    [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TT4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]]
+; CHECK-NEXT:    [[TT8:%.*]] = bitcast i32* [[TT6]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT:    [[TT9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TT10:%.*]] = bitcast i32* [[TT9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT:    [[TT11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TT12]] = add nsw <4 x i32> [[VEC_PHI]], [[TT11]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1)
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK-NEXT:    [[TT13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1)
+; CHECK-NEXT:    [[TT14:%.*]] = icmp ne i32 [[TT13]], 0
+; CHECK-NEXT:    br i1 [[TT14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
-; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4
+; CHECK-NEXT:    [[TT15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT12]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TT16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TT15]])
+; CHECK-NEXT:    store i32 [[TT16]], i32* [[ARRAYIDX7_US]], align 4
 ; CHECK-NEXT:    [[INC9_US]] = add nuw i32 [[I_024_US]], 1
 ; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
@@ -188,49 +188,49 @@ for.cond1.preheader.us.preheader:                 ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp = add i32 %n.vec, -4
-  %tmp1 = lshr i32 %tmp, 2
-  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tt = add i32 %n.vec, -4
+  %tt1 = lshr i32 %tt, 2
+  %tt2 = add nuw nsw i32 %tt1, 1
   br label %for.cond1.preheader.us
 
 for.cond1.preheader.us:                           ; preds = %middle.block, %for.cond1.preheader.us.preheader
   %i.024.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
   %arrayidx.us = getelementptr inbounds i32*, i32** %A, i32 %i.024.us
-  %tmp3 = load i32*, i32** %arrayidx.us, align 4
+  %tt3 = load i32*, i32** %arrayidx.us, align 4
   %arrayidx7.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
   %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4
-  %tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx7.promoted.us, i32 0
-  call void @llvm.set.loop.iterations.i32(i32 %tmp2)
+  %tt4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx7.promoted.us, i32 0
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %for.cond1.preheader.us
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
-  %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp12, %vector.body ]
-  %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp13, %vector.body ]
+  %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
+  %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-  %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index
+  %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index
 
-  ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28
-  %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
+  %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
 
-  %tmp8 = bitcast i32* %tmp6 to <4 x i32>*
-  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
-  %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index
-  %tmp10 = bitcast i32* %tmp9 to <4 x i32>*
-  %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp10, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
-  %tmp11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load
-  %tmp12 = add nsw <4 x i32> %vec.phi, %tmp11
+  %tt8 = bitcast i32* %tt6 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
+  %tt9 = getelementptr inbounds i32, i32* %B, i32 %index
+  %tt10 = bitcast i32* %tt9 to <4 x i32>*
+  %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt10, i32 4, <4 x i1> %tt7, <4 x i32> undef)
+  %tt11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load
+  %tt12 = add nsw <4 x i32> %vec.phi, %tt11
   %index.next = add i32 %index, 4
-  %tmp13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1)
-  %tmp14 = icmp ne i32 %tmp13, 0
-  br i1 %tmp14, label %vector.body, label %middle.block
+  %tt13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1)
+  %tt14 = icmp ne i32 %tt13, 0
+  br i1 %tt14, label %vector.body, label %middle.block
 
 middle.block:                                     ; preds = %vector.body
-  %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi
-  %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15)
-  store i32 %tmp16, i32* %arrayidx7.us, align 4
+  %tt15 = select <4 x i1> %tt7, <4 x i32> %tt12, <4 x i32> %vec.phi
+  %tt16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt15)
+  store i32 %tt16, i32* %arrayidx7.us, align 4
   %inc9.us = add nuw i32 %i.024.us, 1
   %exitcond26 = icmp eq i32 %inc9.us, %N
   br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
@@ -250,7 +250,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
 
 ; Function Attrs: noduplicate nounwind
-declare void @llvm.set.loop.iterations.i32(i32) #2
+declare i32 @llvm.start.loop.iterations.i32(i32) #2
 
 ; Function Attrs: noduplicate nounwind
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
index 4a5f48331090e..950702dd8b439 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
@@ -13,11 +13,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@@ -49,7 +49,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -152,7 +152,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
index c27a6c32f5b31..9b3d7cd5b1a63 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
@@ -14,7 +14,7 @@
     br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     %6 = shl i32 %4, 3
     %7 = sub i32 %N, %6
     br label %vector.body
@@ -23,7 +23,7 @@
     %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ]
-    %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ]
+    %8 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ]
     %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ]
     %lsr.iv2022 = bitcast i8* %lsr.iv20 to <16 x i8>*
     %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>*
@@ -54,7 +54,7 @@
 
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
   declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4
 
@@ -180,7 +180,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg
     renamable $r3 = t2LSRri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
index 57d8014712a5b..7a6682f9d2da5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir
@@ -14,14 +14,14 @@
     br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv19 = phi i8* [ %scevgep20, %vector.body ], [ %res, %vector.ph ]
     %lsr.iv16 = phi i8* [ %scevgep17, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv1921 = bitcast i8* %lsr.iv19 to <16 x i8>*
     %lsr.iv1618 = bitcast i8* %lsr.iv16 to <16 x i8>*
@@ -45,7 +45,7 @@
 
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
   declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <16 x i1> @llvm.arm.mve.vctp8(i32)
 
@@ -155,7 +155,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
index 6411a808572e8..ef4c13ea8e888 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir
@@ -14,11 +14,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
@@ -39,7 +39,7 @@
   }
 
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
@@ -123,7 +123,7 @@ body:             |
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
index c3655baeb8b1c..eb0b41f5dac28 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
@@ -14,14 +14,14 @@
     br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i8* %lsr.iv to <4 x i8>*
     %lsr.iv1416 = bitcast i8* %lsr.iv14 to <4 x i8>*
@@ -61,14 +61,14 @@
     br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv15 = phi i8* [ %scevgep16, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv14 = bitcast i8* %lsr.iv to <4 x i8>*
     %lsr.iv1517 = bitcast i8* %lsr.iv15 to <4 x i8>*
@@ -108,14 +108,14 @@
     br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i16* %lsr.iv to <4 x i16>*
     %lsr.iv1416 = bitcast i16* %lsr.iv14 to <4 x i16>*
@@ -155,14 +155,14 @@
     br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv15 = phi i16* [ %scevgep16, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>*
     %lsr.iv1517 = bitcast i16* %lsr.iv15 to <4 x i16>*
@@ -203,14 +203,14 @@
     br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv13 = phi i32* [ %scevgep14, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv12 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1315 = bitcast i32* %lsr.iv13 to <4 x i32>*
@@ -249,14 +249,14 @@
     br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -286,7 +286,7 @@
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -372,7 +372,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -478,7 +478,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -585,7 +585,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -691,7 +691,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -797,7 +797,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -903,7 +903,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body (align 4):
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index bf3a8b7866302..ff9106172d8dc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -69,26 +69,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vadd.i16 q1, q0, q1
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, q2
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -142,25 +142,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q1, [r1], #16
+; CHECK-NEXT:    vldrbt.u8 q0, [r1], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r0], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vsub.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q1, q1, q0
+; CHECK-NEXT:    vsub.i8 q0, q2, q0
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@@ -212,25 +212,25 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vsub.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vsub.i16 q0, q2, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -284,25 +284,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r3, r2, #15
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #15
 ; CHECK-NEXT:    sub.w r12, r3, #16
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #4
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.8 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u8 q1, [r0], #16
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
 ; CHECK-NEXT:    vldrbt.u8 q2, [r1], #16
 ; CHECK-NEXT:    subs r2, #16
-; CHECK-NEXT:    vmul.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q1, q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q2, q0
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB4_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    uxtb r0, r0
@@ -354,25 +354,25 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #8
 ; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT:    subs r2, #8
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q2, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    sxth r0, r0
@@ -423,36 +423,36 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    subs r6, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r6, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u32 q1, [r4], #4
+; CHECK-NEXT:    vldrbt.u32 q0, [r4], #4
 ; CHECK-NEXT:    vldrbt.u32 q2, [r5], #4
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vmul.i32 q0, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB6_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u32 r12, q0
 ; CHECK-NEXT:    cbz r2, .LBB6_7
 ; CHECK-NEXT:  @ %bb.4: @ %vector.ph47
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r6, lsr #2
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vdup.32 q0, r3
+; CHECK-NEXT:    add.w r3, r3, r6, lsr #2
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vdup.32 q0, r6
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov.32 q0[0], r12
 ; CHECK-NEXT:  .LBB6_5: @ %vector.body46
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -550,32 +550,32 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
 ; CHECK-NEXT:    cbz r2, .LBB7_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    bic r3, r3, #7
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    add.w lr, r4, r3, lsr #3
-; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
 ; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrbt.u16 q1, [r3], #8
+; CHECK-NEXT:    vldrbt.u16 q0, [r3], #8
 ; CHECK-NEXT:    vldrbt.u16 q4, [r4], #8
 ; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vsub.i16 q3, q4, q1
-; CHECK-NEXT:    vmul.i16 q1, q4, q1
+; CHECK-NEXT:    vsub.i16 q3, q4, q0
+; CHECK-NEXT:    vmul.i16 q0, q4, q0
 ; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    vadd.i16 q3, q3, q2
-; CHECK-NEXT:    vadd.i16 q1, q1, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB7_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u16 r4, q2
 ; CHECK-NEXT:    vaddv.u16 r2, q0
 ; CHECK-NEXT:    b .LBB7_5
@@ -643,40 +643,40 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    mov r1, r0
 ; CHECK-NEXT:    movw r12, #47184
-; CHECK-NEXT:    movw r3, #23593
 ; CHECK-NEXT:    ldrd r2, lr, [r1, #4]
+; CHECK-NEXT:    movw r1, #23593
 ; CHECK-NEXT:    movt r12, #1310
-; CHECK-NEXT:    movt r3, #49807
-; CHECK-NEXT:    mla r3, lr, r3, r12
-; CHECK-NEXT:    movw r1, #55051
+; CHECK-NEXT:    movt r1, #49807
+; CHECK-NEXT:    mla r1, lr, r1, r12
+; CHECK-NEXT:    movw r3, #55051
 ; CHECK-NEXT:    movw r4, #23593
-; CHECK-NEXT:    movt r1, #163
+; CHECK-NEXT:    movt r3, #163
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    movt r4, #655
-; CHECK-NEXT:    ror.w r12, r3, #4
-; CHECK-NEXT:    cmp r12, r1
-; CHECK-NEXT:    cset r1, lo
-; CHECK-NEXT:    ror.w r3, r3, #2
+; CHECK-NEXT:    ror.w r12, r1, #4
+; CHECK-NEXT:    cmp r12, r3
+; CHECK-NEXT:    cset r3, lo
+; CHECK-NEXT:    ror.w r1, r1, #2
 ; CHECK-NEXT:    mov.w r12, #1
-; CHECK-NEXT:    cmp r3, r4
-; CHECK-NEXT:    csel r3, r1, r12, lo
+; CHECK-NEXT:    cmp r1, r4
+; CHECK-NEXT:    csel r1, r3, r12, lo
 ; CHECK-NEXT:    lsls.w r4, lr, #30
-; CHECK-NEXT:    csel r1, r1, r3, ne
+; CHECK-NEXT:    csel r3, r3, r1, ne
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    movs r4, #52
-; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
-; CHECK-NEXT:    movw r3, :lower16:days
-; CHECK-NEXT:    movt r3, :upper16:days
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mla r1, r1, r4, r3
+; CHECK-NEXT:    adds r1, r2, #3
+; CHECK-NEXT:    bic r1, r1, #3
+; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    add.w r4, r12, r1, lsr #2
+; CHECK-NEXT:    movw r12, :lower16:days
+; CHECK-NEXT:    movt r12, :upper16:days
+; CHECK-NEXT:    movs r1, #52
+; CHECK-NEXT:    mla r1, r3, r1, r12
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q0, r3
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
index f334a5950acbe..4915dda8191a9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
@@ -105,8 +105,8 @@ define void @dont_remat_predicated_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32*
 ; CHECK-NEXT:    vmov.i32 q2, #0x1
 ; CHECK-NEXT:    add.w lr, r5, #3
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    add.w lr, r5, lr, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r5, r5, lr, lsr #2
+; CHECK-NEXT:    dls lr, r5
 ; CHECK-NEXT:  .LBB1_1: @ %bb6
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r12

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
index fa799111fe5ff..42233f3fc4129 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir
@@ -31,13 +31,13 @@
     %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec
     %scevgep9 = getelementptr float, float* %pDst, i32 -4
     %scevgep14 = getelementptr float, float* %pSrc, i32 -4
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ]
     %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ]
-    %5 = phi i32 [ %4, %vector.ph ], [ %7, %vector.body ]
+    %5 = phi i32 [ %start1, %vector.ph ], [ %7, %vector.body ]
     %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>*
     %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>*
     %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1
@@ -61,13 +61,13 @@
     %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ]
     %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1
     %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %blkCnt.08.ph)
     br label %while.body
 
   while.body:                                       ; preds = %while.body, %while.body.preheader19
     %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ]
     %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ]
-    %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ]
+    %9 = phi i32 [ %start2, %while.body.preheader19 ], [ %12, %while.body ]
     %scevgep3 = getelementptr float, float* %lsr.iv, i32 1
     %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
     %10 = load float, float* %scevgep3, align 4
@@ -84,7 +84,7 @@
   }
   declare float @llvm.fabs.f32(float)
   declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 
 ...
@@ -262,7 +262,7 @@ body:             |
     renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
     renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg
     $r5 = tMOVr killed $r3, 14, $noreg
     renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg
@@ -305,7 +305,7 @@ body:             |
 
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg
     renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.8.while.body:
     successors: %bb.8(0x7c000000), %bb.9(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
index e0434c4e05039..81514a02577e8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
@@ -14,12 +14,12 @@
     br i1 %cmp6, label %while.end, label %while.body.preheader
   
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %while.body
   
   while.body:                                       ; preds = %while.body, %while.body.preheader
     %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
     %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)()
     %add = add nsw i32 %call, %res.07
     %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
@@ -33,7 +33,7 @@
   
   declare i32 @bar(...) local_unnamed_addr #0
   
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
   
   attributes #0 = { "target-features"="+mve.fp" }
@@ -109,7 +109,7 @@ body:             |
   
     $lr = tMOVr $r0, 14, $noreg
     renamable $r4, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
   
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
index fc2705797404e..cd590c98894e9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir
@@ -14,11 +14,11 @@
     br i1 %cmp6, label %while.end, label %while.body.preheader
   
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %while.body
   
   while.body:                                       ; preds = %while.body, %while.body.preheader
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
     %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
     %add = add i32 %1, 0
     %2 = icmp ne i32 %1, 0
@@ -29,7 +29,7 @@
     ret i32 %res.0.lcssa
   }
   
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
   
   attributes #0 = { "target-features"="+mve.fp" }
@@ -96,7 +96,7 @@ body:             |
     liveins: $r0
   
     $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
   
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
index ca3a1122d982c..d74d77f3158d2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir
@@ -14,11 +14,11 @@
     br i1 %cmp6, label %while.end, label %while.body.preheader
   
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %while.body
   
   while.body:                                       ; preds = %while.body, %while.body.preheader
-    %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ]
+    %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ]
     %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
     %add = add i32 %1, 2
     %2 = icmp ne i32 %1, 0
@@ -30,7 +30,7 @@
   }
   
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
   
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@@ -102,7 +102,7 @@ body:             |
     liveins: $r0
   
     $lr = tMOVr $r0, 14, $noreg
-    t2DoLoopStart killed $r0
+    $lr = t2DoLoopStart killed $r0
   
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
index c68e547ba6288..d1a4f421474d9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir
@@ -30,7 +30,7 @@
     %gap.057 = sdiv i32 %gap.057.in, 2
     %cmp252 = icmp slt i32 %gap.057, %n
     %tmp = sub i32 %n, %gap.057
-    call void @llvm.set.loop.iterations.i32(i32 %tmp)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp)
     br i1 %cmp252, label %for.cond4.preheader.preheader, label %for.cond.loopexit
   
   for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
@@ -44,7 +44,7 @@
     %lsr.iv2 = phi i32* [ %scevgep3, %for.inc16 ], [ %scevgep1, %for.cond4.preheader.preheader ]
     %lsr.iv = phi i32* [ %v, %for.cond4.preheader.preheader ], [ %scevgep, %for.inc16 ]
     %i.053 = phi i32 [ %inc, %for.inc16 ], [ %gap.057, %for.cond4.preheader.preheader ]
-    %tmp8 = phi i32 [ %tmp, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ]
+    %tmp8 = phi i32 [ %start, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ]
     %j.048 = sub nsw i32 %i.053, %gap.057
     %cmp549 = icmp sgt i32 %j.048, -1
     br i1 %cmp549, label %land.rhs.preheader, label %for.inc16
@@ -93,7 +93,7 @@
   }
   
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@@ -208,7 +208,7 @@ body:             |
     renamable $lr = t2SUBrs renamable $r1, renamable $r2, 9, 14, $noreg, $noreg
     renamable $r9 = t2ASRri renamable $r2, 1, 14, $noreg, $noreg
     t2CMPrs renamable $r1, killed renamable $r2, 9, 14, $noreg, implicit-def $cpsr
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
     tBcc %bb.2, 13, killed $cpsr
   
   bb.4.for.cond4.preheader.preheader:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
index 20baa86fa1bf1..e596da82915bc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir
@@ -11,7 +11,7 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %preheader
 
   preheader:
@@ -20,7 +20,7 @@
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ]
     %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
     %1 = load i32, i32* %scevgep6, align 4
@@ -35,7 +35,7 @@
     ret i32 0
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
 
   attributes #0 = { noduplicate nounwind }
@@ -120,7 +120,7 @@ body:             |
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
 

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
index a5a83982fcc54..7cc240353e87a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir
@@ -8,11 +8,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -43,11 +43,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -72,7 +72,7 @@
     ret void
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -160,7 +160,7 @@ body:             |
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
 
     renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
     $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:
@@ -261,7 +261,7 @@ body:             |
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
 
     renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 69d370fc01a2f..c4c64966620bb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@@ -15,29 +15,29 @@ define arm_aapcs_vfpcc void @test(i16* noalias nocapture readonly %off, i16* noa
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_3: @ %for.body4.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r6, [r0, r5, lsl #1]
-; CHECK-NEXT:    ldrh.w r7, [r1, r5, lsl #1]
-; CHECK-NEXT:    add r6, r7
-; CHECK-NEXT:    strh.w r6, [r4, r5, lsl #1]
-; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    ldrh.w r5, [r0, r6, lsl #1]
+; CHECK-NEXT:    ldrh.w r7, [r1, r6, lsl #1]
+; CHECK-NEXT:    add r5, r7
+; CHECK-NEXT:    strh.w r5, [r4, r6, lsl #1]
+; CHECK-NEXT:    adds r6, #1
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %for.body15.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_5: @ %for.body15.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r7, [r0, r5, lsl #1]
-; CHECK-NEXT:    ldrh.w r6, [r1, r5, lsl #1]
-; CHECK-NEXT:    add r6, r7
-; CHECK-NEXT:    strh.w r6, [r2, r5, lsl #1]
-; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    ldrh.w r7, [r0, r6, lsl #1]
+; CHECK-NEXT:    ldrh.w r5, [r1, r6, lsl #1]
+; CHECK-NEXT:    add r5, r7
+; CHECK-NEXT:    strh.w r5, [r2, r6, lsl #1]
+; CHECK-NEXT:    adds r6, #1
 ; CHECK-NEXT:    le lr, .LBB0_5
 ; CHECK-NEXT:  @ %bb.6: @ %for.cond.cleanup14.us
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
index 5b74bd7352803..21efcc1bf2dd8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
@@ -16,7 +16,7 @@
     %scevgep = getelementptr i32, i32* %a, i32 -1
     %scevgep4 = getelementptr i32, i32* %c, i32 -1
     %scevgep8 = getelementptr i32, i32* %b, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
     br label %for.body
 
   for.cond.cleanup:                                 ; preds = %for.body, %entry
@@ -26,7 +26,7 @@
     %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
     %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
     %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ]
     %size = call i32 @llvm.arm.space(i32 4070, i32 undef)
     %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1
     %1 = load i32, i32* %scevgep3, align 4
@@ -47,7 +47,7 @@
   declare i32 @llvm.arm.space(i32 immarg, i32) #0
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
@@ -155,7 +155,7 @@ body:             |
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
     renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg
     $lr = tMOVr $r3, 14, $noreg
-    t2DoLoopStart killed $r3
+    $lr = t2DoLoopStart killed $r3
 
   bb.2.for.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
index 497f0412589b2..5b66b258b55ef 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
@@ -20,7 +20,7 @@
 
   vector.ph:                                        ; preds = %entry
     %7 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0, !dbg !32
-    call void @llvm.set.loop.iterations.i32(i32 %6), !dbg !32
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %6), !dbg !32
     %8 = shl i32 %5, 2, !dbg !32
     %9 = sub i32 %N, %8, !dbg !32
     br label %vector.body, !dbg !32
@@ -28,7 +28,7 @@
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %b, %vector.ph ], !dbg !33
     %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %15, %vector.body ]
-    %10 = phi i32 [ %6, %vector.ph ], [ %16, %vector.body ]
+    %10 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ]
     %11 = phi i32 [ %N, %vector.ph ], [ %13, %vector.body ]
     %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>*
     %12 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %11), !dbg !34
@@ -59,7 +59,7 @@
   declare void @llvm.dbg.value(metadata, metadata, metadata)
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -261,7 +261,7 @@ body:             |
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, renamable $r3, 19, 14, $noreg, $noreg, debug-location !32
     renamable $r3, dead $cpsr = tLSRri killed renamable $r3, 2, 14, $noreg, debug-location !32
     renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 18, 14, $noreg, $noreg, debug-location !32
-    t2DoLoopStart renamable $lr, debug-location !32
+    $lr = t2DoLoopStart renamable $lr, debug-location !32
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
index a97da0c0f9e5e..f30ebf459bd60 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
@@ -17,7 +17,7 @@
     br i1 %cmp11, label %for.cond.cleanup, label %for.body.preheader
   
   for.body.preheader:                               ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %N)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
     br label %for.body
   
   for.cond.cleanup:                                 ; preds = %for.inc, %entry
@@ -30,7 +30,7 @@
     %lsr.iv1 = phi i8* [ %c, %for.body.preheader ], [ %scevgep, %for.inc ]
     %spaces.013 = phi i32 [ %spaces.1, %for.inc ], [ 0, %for.body.preheader ]
     %found.012 = phi i32 [ %found.1, %for.inc ], [ 0, %for.body.preheader ]
-    %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.inc ]
+    %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.inc ]
     %1 = load i8, i8* %lsr.iv1, align 1
     %2 = zext i8 %1 to i32
     switch i32 %2, label %for.inc [
@@ -58,7 +58,7 @@
   }
   
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@@ -130,7 +130,7 @@ body:             |
     liveins: $r0, $r1
   
     $lr = tMOVr $r1, 14, $noreg
-    t2DoLoopStart killed $r1
+    $lr = t2DoLoopStart killed $r1
     renamable $r1, dead $cpsr = tMOVi8 0, 14, $noreg
     renamable $r12 = t2MOVi 1, 14, $noreg, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
index 51300a959f5a1..1492a01a272ee 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -25,12 +25,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
   %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -82,12 +82,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -138,12 +138,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -193,12 +193,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -252,12 +252,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -311,12 +311,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -374,7 +374,7 @@ vector.ph:
   %scevgep = getelementptr i32, i32* %A, i32 8
   %scevgep30 = getelementptr i32, i32* %C, i32 8
   %scevgep37 = getelementptr i32, i32* %B, i32 8
-  call void @llvm.set.loop.iterations.i32(i32 %v5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
   br label %vector.body
 
 vector.body:
@@ -382,7 +382,7 @@ vector.body:
   %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
   %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
-  %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ]
+  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
   %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -447,7 +447,7 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -455,7 +455,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
@@ -496,7 +496,7 @@ entry:
 
 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -504,7 +504,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
 
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -547,7 +547,7 @@ entry:
 
 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -558,7 +558,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 ; AddRec base is not 0:
   %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]
 
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
@@ -589,7 +589,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i
 declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
index 58f3a94b061f8..4682f1d36f31d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -4,14 +4,14 @@
 define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
@@ -36,7 +36,7 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -44,7 +44,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -77,13 +77,13 @@ for.cond.cleanup:
 define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 2000)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 2000, [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@@ -101,14 +101,14 @@ define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 2000)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
   br label %vector.body
 
 vector.body:
   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
-  %0 = phi i32 [ 2000, %entry ], [ %2, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -131,14 +131,14 @@ for.cond.cleanup:
 define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@@ -161,7 +161,7 @@ define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -169,7 +169,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -199,14 +199,14 @@ for.cond.cleanup:
 define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 8001)
+; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
@@ -229,7 +229,7 @@ define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture re
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -237,7 +237,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -273,7 +273,7 @@ for.cond.cleanup:
 ;
 define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -281,7 +281,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -316,7 +316,7 @@ for.cond.cleanup:
 ;
 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 1073741824)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
   br label %vector.body
 
 vector.body:
@@ -324,7 +324,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -359,7 +359,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -367,7 +367,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -402,7 +402,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -410,7 +410,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -448,7 +448,7 @@ for.cond.cleanup:
 ;
 define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   br label %vector.body
 
 vector.body:
@@ -456,7 +456,7 @@ vector.body:
   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
+  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -502,7 +502,7 @@ vector.ph:                                        ; preds = %vector.ph.preheader
   %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
   %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
   %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
-  call void @llvm.set.loop.iterations.i32(i32 1025)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -510,7 +510,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
   %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ]
+  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
   %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
@@ -546,5 +546,5 @@ for.cond.cleanup3:                                ; preds = %vector.body
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index d7f85315d7c5e..1d6ce3d8a899b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -83,7 +83,7 @@ entry:
 
 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -91,7 +91,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
 
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -118,6 +118,6 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index 3c6dd9c9f7d1b..c1ebfce840890 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -246,11 +246,11 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI5_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q2, q0, r12

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
index 52cd8fdc6d798..956bf9207e188 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
@@ -18,12 +18,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -50,5 +50,5 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
index 45c7b8f3e6239..9c4a7ed350843 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll
@@ -20,12 +20,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -65,12 +65,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -110,12 +110,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -155,12 +155,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -200,12 +200,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.2 = add i32 %N, -2
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -245,12 +245,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -289,12 +289,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %incorrect = add i32 %index, 1
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -335,12 +335,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -380,12 +380,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
@@ -425,12 +425,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, %offsets
@@ -470,12 +470,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -501,6 +501,6 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-declare void @llvm.set.loop.iterations.i32(i32) #3
+declare i32 @llvm.start.loop.iterations.i32(i32) #3
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index d786209ad3fb0..ef79f27ce5dcb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -23,13 +23,13 @@ vector.ph:
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
-  %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -94,13 +94,13 @@ vector.ph:
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add nuw nsw i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
-  %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -158,13 +158,13 @@ entry:
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add nuw nsw i32 %1, 1
-  call void @llvm.set.loop.iterations.i32(i32 %2)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %2)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
-  %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
+  %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -228,7 +228,7 @@ for.body:
 
 vector.ph:                                        ; preds = %for.body
   %trip.count.minus.1 = add i32 %8, -1
-  call void @llvm.set.loop.iterations.i32(i32 %7)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -236,7 +236,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ]
-  %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ]
+  %9 = phi i32 [ %start, %vector.ph ], [ %17, %vector.body ]
   %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
   %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8)
@@ -278,7 +278,7 @@ for.end17:                                        ; preds = %for.end, %entry
 }
 
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 5d81c4c07eeaf..939d3cc5e5582 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -17,12 +17,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -76,13 +76,13 @@ vector.ph:                                        ; preds = %entry
   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -139,12 +139,12 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
+  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
@@ -178,7 +178,7 @@ declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg,
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
index 6d140589287d8..d4bb6b757ec67 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -7,14 +7,14 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    adr r3, .LCPI0_0
-; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    vldrw.u32 q2, [r3]
 ; CHECK-NEXT:    vmov.i32 q0, #0x80000000
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vmov.i32 q3, #0xa
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q4, q2, r2

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
index dec54006c7d96..cd2d311a5eacc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
@@ -14,11 +14,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %lsr.iv.2 = phi i16* [ %scevgep.2, %vector.body ], [ %c, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -40,7 +40,7 @@
   }
 
   declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)
   declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
@@ -132,7 +132,7 @@ body:             |
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     $r12 = t2MOVi16 32768, 14 /* CC::al */, $noreg
     $r12 = t2MOVTi16 killed $r12, 65535, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r5 = tMOVr killed $r3, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
index 440080e4e142d..0b82b98bf7304 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@@ -57,9 +57,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
 ; CHECK-NEXT:    subs r3, r2, r3
 ; CHECK-NEXT:    add.w r12, r3, #3
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
index 3a9aa031e25cd..17c66e9169e03 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir
@@ -37,18 +37,18 @@
     br i1 %7, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
 
   for.body.preheader.new:                           ; preds = %for.body.preheader
-    call void @llvm.set.loop.iterations.i32(i32 %11)
+    %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %11)
     br label %for.body
 
   vector.ph:                                        ; preds = %vector.memcheck
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv50 = phi i8* [ %scevgep51, %vector.body ], [ %res, %vector.ph ]
     %lsr.iv47 = phi i8* [ %scevgep48, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep45, %vector.body ], [ %a, %vector.ph ]
-    %12 = phi i32 [ %5, %vector.ph ], [ %17, %vector.body ]
+    %12 = phi i32 [ %start2, %vector.ph ], [ %17, %vector.body ]
     %13 = phi i32 [ %N, %vector.ph ], [ %15, %vector.body ]
     %lsr.iv5052 = bitcast i8* %lsr.iv50 to <16 x i8>*
     %lsr.iv4749 = bitcast i8* %lsr.iv47 to <16 x i8>*
@@ -88,7 +88,7 @@
 
   for.body:                                         ; preds = %for.body, %for.body.preheader.new
     %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
-    %21 = phi i32 [ %11, %for.body.preheader.new ], [ %30, %for.body ]
+    %21 = phi i32 [ %start1, %for.body.preheader.new ], [ %30, %for.body ]
     %scevgep23 = getelementptr i8, i8* %a, i32 %i.011
     %scevgep2453 = bitcast i8* %scevgep23 to i8*
     %22 = load i8, i8* %scevgep2453, align 1
@@ -159,7 +159,7 @@
 
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
   declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4
 
@@ -429,7 +429,7 @@ body:             |
     renamable $r6 = t2BICri killed renamable $r6, 15, 14, $noreg, $noreg
     renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 16, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r6, 35, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.5.vector.body:
     successors: %bb.5(0x7c000000), %bb.11(0x04000000)
@@ -455,7 +455,7 @@ body:             |
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r3, 19, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.7.for.body:
     successors: %bb.7(0x7c000000), %bb.8(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
index db8f969ef7266..11197b5514be3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir
@@ -7,14 +7,14 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     %limit = lshr i32 %n, 1
     br label %while.body
 
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ]
+    %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ]
     %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1
     %tmp1 = load i32, i32* %scevgep7, align 4
@@ -33,7 +33,7 @@
   }
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@@ -130,7 +130,7 @@ body:             |
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
     renamable $r2 = t2LSRri renamable $lr, 1, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.1.while.body:
     successors: %bb.1(0x7c000000), %bb.2(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
index 475120a1ab612..f3589590d12bd 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
@@ -7,14 +7,14 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     %limit = lshr i32 %n, 1
     br label %while.body
 
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ]
-    %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ]
+    %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ]
     %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1
     %tmp1 = load i32, i32* %scevgep7, align 4
@@ -33,7 +33,7 @@
   }
 
   ; Function Attrs: noduplicate nounwind
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
 
   ; Function Attrs: noduplicate nounwind
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
@@ -129,7 +129,7 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r7, -8
     renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
-    t2DoLoopStart renamable $r0
+    $lr = t2DoLoopStart renamable $r0
     renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg
     $lr = tMOVr $r0, 14, $noreg
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir
deleted file mode 100644
index f39af6fb090cc..0000000000000
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir
+++ /dev/null
@@ -1,122 +0,0 @@
-# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
-# CHECK-NOT: $lr = t2DLS
-# CHECK: $lr = tMOVr $r0, 14
-# CHECK-NOT: $lr = t2LEUpdate
-
---- |
-  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-  target triple = "thumbv8.1m.main"
-  
-  define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
-  entry:
-    %scevgep = getelementptr i32, i32* %q, i32 -1
-    %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
-    br label %preheader
-
-  preheader:
-    br label %while.body
-  
-  while.body:                                       ; preds = %while.body, %entry
-    %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
-    %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
-    %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
-    %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
-    %1 = load i32, i32* %scevgep6, align 4
-    store i32 %1, i32* %scevgep2, align 4
-    %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1
-    %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1
-    %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
-    %3 = icmp ne i32 %2, 0
-    br i1 %3, label %while.body, label %while.end
-  
-  while.end:                                        ; preds = %while.body
-    ret i32 0
-  }
-  
-  declare void @llvm.set.loop.iterations.i32(i32) #0
-  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
-  
-  attributes #0 = { noduplicate nounwind }
-  attributes #1 = { nounwind }
-
-...
----
-name:            do_copy
-alignment:       2
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-registers:       []
-liveins:
-  - { reg: '$r0', virtual-reg: '' }
-  - { reg: '$r1', virtual-reg: '' }
-  - { reg: '$r2', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       8
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 0
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:
-  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, 
-      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, 
-      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
-body:             |
-  bb.0.entry:
-    successors: %bb.1(0x80000000)
-    liveins: $r0, $r1, $r2, $r7, $lr
-  
-    frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp
-    frame-setup CFI_INSTRUCTION def_cfa_offset 8
-    frame-setup CFI_INSTRUCTION offset $lr, -4
-    frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
-    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg
-    renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
-
-  bb.1.preheader:
-    successors: %bb.2(0x80000000)
-    liveins: $r0, $r1, $lr
-    $lr = tMOVr $r0, 14, $noreg
-  
-  bb.2.while.body:
-    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-    liveins: $lr, $r0, $r1
-  
-    renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6)
-    early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2)
-    renamable $lr = t2LoopDec killed renamable $lr, 1
-    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
-    tB %bb.3, 14, $noreg
-  
-  bb.3.while.end:
-    $r0, dead $cpsr = tMOVi8 0, 14, $noreg
-    tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0
-
-...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
index 666ad4dd742a8..34b7cf1e72230 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir
@@ -8,11 +8,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -44,11 +44,11 @@
     br i1 %cmp, label %exit, label %loop.ph
 
   loop.ph:                                          ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %iters)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
     br label %loop.body
 
   loop.body:                                        ; preds = %loop.body, %loop.ph
-    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ]
     %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ]
     %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ]
     %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ]
@@ -75,7 +75,7 @@
     ret void
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -163,7 +163,7 @@ body:             |
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
 
     renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
     $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:
@@ -269,7 +269,7 @@ body:             |
     liveins: $r0, $r1, $r2, $r3, $r4, $lr
 
     renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r4
+    $lr = t2DoLoopStart renamable $r4
     $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg
 
   bb.2.loop.body:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
index c0c04f997c35d..87cc5c5704d7f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir
@@ -9,7 +9,7 @@
   entry:
     %scevgep = getelementptr i32, i32* %q, i32 -1
     %scevgep3 = getelementptr i32, i32* %p, i32 -1
-    call void @llvm.set.loop.iterations.i32(i32 %n)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
     br label %preheader
 
   preheader:
@@ -18,7 +18,7 @@
   while.body:                                       ; preds = %while.body, %entry
     %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
     %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
-    %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ]
+    %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ]
     %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1
     %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1
     %1 = load i32, i32* %scevgep6, align 4
@@ -33,7 +33,7 @@
     ret i32 0
   }
 
-  declare void @llvm.set.loop.iterations.i32(i32) #0
+  declare i32 @llvm.start.loop.iterations.i32(i32) #0
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
 
   attributes #0 = { noduplicate nounwind }
@@ -89,11 +89,12 @@ body:             |
   ; CHECK-LABEL: name: do_copy
   ; CHECK: bb.0.entry:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $lr, $r2, $r7
+  ; CHECK:   liveins: $r0, $r2, $r7
   ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, implicit-def $sp, implicit $sp
   ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
   ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   $lr = t2DLS killed $r0
   ; CHECK:   renamable $r0 = t2SUBri killed renamable $lr, 4, 14 /* CC::al */, $noreg, def dead $cpsr
   ; CHECK:   renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg
   ; CHECK: bb.1.preheader:
@@ -105,9 +106,7 @@ body:             |
   ; CHECK:   liveins: $lr, $r0, $r1
   ; CHECK:   renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep6)
   ; CHECK:   early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.scevgep2)
-  ; CHECK:   $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr
-  ; CHECK:   tBcc %bb.2, 1 /* CC::ne */, killed $cpsr
-  ; CHECK:   tB %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.while.end:
   ; CHECK:   $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0
@@ -119,7 +118,7 @@ body:             |
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    t2DoLoopStart $r0
+    $lr = t2DoLoopStart $r0
     renamable $r0 = t2SUBri killed renamable $lr, 4, 14, $noreg, def $cpsr
     renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
index 1d9f7d72877e5..895f48a2771a3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
@@ -13,11 +13,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -51,11 +51,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -89,11 +89,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -127,11 +127,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -165,11 +165,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -204,11 +204,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -243,11 +243,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -282,11 +282,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -321,11 +321,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -361,11 +361,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -401,11 +401,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -440,11 +440,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -479,11 +479,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -518,11 +518,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -557,11 +557,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
@@ -596,11 +596,11 @@
     br i1 %cmp9, label %exit, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
     %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
@@ -635,7 +635,7 @@
     br i1 %cmp22, label %while.body.preheader, label %while.end
 
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %while.body
 
   while.body:                                       ; preds = %while.body.preheader, %while.body
@@ -643,7 +643,7 @@
     %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
     %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
     %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
     %tmp3 = bitcast i16* %y.addr.025 to <4 x i16>*
     %tmp1 = bitcast i16* %x.addr.026 to <4 x i16>*
     %tmp = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.023)
@@ -678,7 +678,7 @@
     br i1 %cmp22, label %while.body.preheader, label %while.end
 
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %while.body
 
   while.body:                                       ; preds = %while.body.preheader, %while.body
@@ -686,7 +686,7 @@
     %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
     %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
     %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
     %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
     %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
     %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@@ -720,7 +720,7 @@
     br i1 %cmp22, label %while.body.preheader, label %while.end
 
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %while.body
 
   while.body:                                       ; preds = %while.body.preheader, %while.body
@@ -728,7 +728,7 @@
     %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
     %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
     %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
     %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
     %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
     %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@@ -763,7 +763,7 @@
     br i1 %cmp22, label %while.body.preheader, label %while.end
 
   while.body.preheader:                             ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %4)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
     br label %while.body
 
   while.body:                                       ; preds = %while.body.preheader, %while.body
@@ -771,7 +771,7 @@
     %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ]
     %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ]
     %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ]
-    %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ]
+    %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ]
     %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>*
     %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>*
     %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023)
@@ -803,7 +803,7 @@
   declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
   declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
   declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
@@ -887,7 +887,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -986,7 +986,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -1085,7 +1085,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -1185,7 +1185,7 @@ body:             |
     renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
 
@@ -1304,7 +1304,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -1417,7 +1417,7 @@ body:             |
     renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
 
@@ -1537,7 +1537,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -1650,7 +1650,7 @@ body:             |
     renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
 
@@ -1779,7 +1779,7 @@ body:             |
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -1904,7 +1904,7 @@ body:             |
     renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2032,7 +2032,7 @@ body:             |
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2157,7 +2157,7 @@ body:             |
     renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2285,7 +2285,7 @@ body:             |
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2410,7 +2410,7 @@ body:             |
     renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2538,7 +2538,7 @@ body:             |
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8)
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2663,7 +2663,7 @@ body:             |
     renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0)
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg
 
   bb.2.vector.body:
@@ -2781,7 +2781,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -2897,7 +2897,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -3026,7 +3026,7 @@ body:             |
     renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.while.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -3150,7 +3150,7 @@ body:             |
     renamable $r12 = t2ADDri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg
     renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $r2 = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg
-    t2DoLoopStart renamable $r2
+    $lr = t2DoLoopStart renamable $r2
     $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg
     renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 7aa772c6394b6..1ea183d4a5fff 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -26,7 +26,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    ldr r0, [sp, #36]
 ; ENABLED-NEXT:    add.w r12, r2, #3
 ; ENABLED-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; ENABLED-NEXT:    movs r6, #0
+; ENABLED-NEXT:    mov.w r8, #0
 ; ENABLED-NEXT:    mov r9, r12
 ; ENABLED-NEXT:    uxth r0, r0
 ; ENABLED-NEXT:    rsbs r5, r0, #0
@@ -37,32 +37,32 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    lsrs r0, r0, #16
 ; ENABLED-NEXT:    sub.w r9, r9, #1
-; ENABLED-NEXT:    strh.w r0, [r1, r6, lsl #1]
-; ENABLED-NEXT:    adds r6, #1
+; ENABLED-NEXT:    strh.w r0, [r1, r8, lsl #1]
+; ENABLED-NEXT:    add.w r8, r8, #1
 ; ENABLED-NEXT:    add.w r10, r10, #2
-; ENABLED-NEXT:    cmp r6, r3
+; ENABLED-NEXT:    cmp r8, r3
 ; ENABLED-NEXT:    beq .LBB0_8
 ; ENABLED-NEXT:  .LBB0_4: @ %for.body
 ; ENABLED-NEXT:    @ =>This Loop Header: Depth=1
 ; ENABLED-NEXT:    @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT:    cmp r2, r6
+; ENABLED-NEXT:    cmp r2, r8
 ; ENABLED-NEXT:    ble .LBB0_2
 ; ENABLED-NEXT:  @ %bb.5: @ %vector.ph
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    bic r0, r9, #3
 ; ENABLED-NEXT:    movs r7, #1
 ; ENABLED-NEXT:    subs r0, #4
-; ENABLED-NEXT:    subs r4, r2, r6
+; ENABLED-NEXT:    sub.w r4, r2, r8
 ; ENABLED-NEXT:    vmov.i32 q1, #0x0
-; ENABLED-NEXT:    add.w r8, r7, r0, lsr #2
-; ENABLED-NEXT:    sub.w r0, r12, r6
+; ENABLED-NEXT:    add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT:    sub.w r0, r12, r8
 ; ENABLED-NEXT:    bic r0, r0, #3
 ; ENABLED-NEXT:    subs r0, #4
 ; ENABLED-NEXT:    add.w r0, r7, r0, lsr #2
 ; ENABLED-NEXT:    mov r7, r10
 ; ENABLED-NEXT:    dls lr, r0
 ; ENABLED-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; ENABLED:  .LBB0_6: @ %vector.body
+; ENABLED-NEXT:  .LBB0_6: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
 ; ENABLED-NEXT:    vctp.32 r4
@@ -70,9 +70,9 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    vpstt
 ; ENABLED-NEXT:    vldrht.s32 q1, [r0], #8
 ; ENABLED-NEXT:    vldrht.s32 q2, [r7], #8
-; ENABLED-NEXT:    mov lr, r8
+; ENABLED-NEXT:    mov lr, r6
 ; ENABLED-NEXT:    vmul.i32 q1, q2, q1
-; ENABLED-NEXT:    sub.w r8, r8, #1
+; ENABLED-NEXT:    subs r6, #1
 ; ENABLED-NEXT:    vshl.s32 q1, r5
 ; ENABLED-NEXT:    subs r4, #4
 ; ENABLED-NEXT:    vadd.i32 q1, q1, q0
@@ -97,7 +97,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    ldr r0, [sp, #36]
 ; NOREDUCTIONS-NEXT:    add.w r12, r2, #3
 ; NOREDUCTIONS-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT:    movs r6, #0
+; NOREDUCTIONS-NEXT:    mov.w r8, #0
 ; NOREDUCTIONS-NEXT:    mov r9, r12
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
 ; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
@@ -108,31 +108,31 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    lsrs r0, r0, #16
 ; NOREDUCTIONS-NEXT:    sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r6, lsl #1]
-; NOREDUCTIONS-NEXT:    adds r6, #1
+; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r8, lsl #1]
+; NOREDUCTIONS-NEXT:    add.w r8, r8, #1
 ; NOREDUCTIONS-NEXT:    add.w r10, r10, #2
-; NOREDUCTIONS-NEXT:    cmp r6, r3
-; NOREDUCTIONS:         beq .LBB0_8
+; NOREDUCTIONS-NEXT:    cmp r8, r3
+; NOREDUCTIONS-NEXT:    beq .LBB0_8
 ; NOREDUCTIONS-NEXT:  .LBB0_4: @ %for.body
 ; NOREDUCTIONS-NEXT:    @ =>This Loop Header: Depth=1
 ; NOREDUCTIONS-NEXT:    @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT:    cmp r2, r6
+; NOREDUCTIONS-NEXT:    cmp r2, r8
 ; NOREDUCTIONS-NEXT:    ble .LBB0_2
 ; NOREDUCTIONS-NEXT:  @ %bb.5: @ %vector.ph
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
 ; NOREDUCTIONS-NEXT:    movs r7, #1
 ; NOREDUCTIONS-NEXT:    subs r0, #4
-; NOREDUCTIONS-NEXT:    subs r4, r2, r6
+; NOREDUCTIONS-NEXT:    sub.w r4, r2, r8
 ; NOREDUCTIONS-NEXT:    vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT:    add.w r8, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT:    sub.w r0, r12, r6
+; NOREDUCTIONS-NEXT:    add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT:    sub.w r0, r12, r8
 ; NOREDUCTIONS-NEXT:    bic r0, r0, #3
 ; NOREDUCTIONS-NEXT:    subs r0, #4
 ; NOREDUCTIONS-NEXT:    add.w r0, r7, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    mov r7, r10
 ; NOREDUCTIONS-NEXT:    dls lr, r0
-; NOREDUCTIONS:         ldr r0, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -141,9 +141,9 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    vpstt
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q1, [r0], #8
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q2, [r7], #8
-; NOREDUCTIONS-NEXT:    mov lr, r8
+; NOREDUCTIONS-NEXT:    mov lr, r6
 ; NOREDUCTIONS-NEXT:    vmul.i32 q1, q2, q1
-; NOREDUCTIONS-NEXT:    sub.w r8, r8, #1
+; NOREDUCTIONS-NEXT:    subs r6, #1
 ; NOREDUCTIONS-NEXT:    vshl.s32 q1, r5
 ; NOREDUCTIONS-NEXT:    subs r4, #4
 ; NOREDUCTIONS-NEXT:    vadd.i32 q1, q1, q0
@@ -184,7 +184,7 @@ for.body:                                         ; preds = %for.end, %for.body.
 
 vector.ph:                                        ; preds = %for.body
   %trip.count.minus.1 = add i32 %i8, -1
-  call void @llvm.set.loop.iterations.i32(i32 %i7)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -192,7 +192,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i16, %vector.body ]
-  %i9 = phi i32 [ %i7, %vector.ph ], [ %i17, %vector.body ]
+  %i9 = phi i32 [ %start, %vector.ph ], [ %i17, %vector.body ]
   %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
   %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8)
@@ -237,4 +237,4 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index c8001df58e8cc..755096f25eb30 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -51,8 +51,8 @@ define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov.i32 q1, #0x1
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.32 lr, r0
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
index 4308c7e50edaa..b9ce1bfb4d706 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
@@ -17,11 +17,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
@@ -56,7 +56,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 
@@ -174,7 +174,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14, $noreg
 
   bb.2.vector.body:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
index c5da60f05ff71..f0c2d9e873d72 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -17,13 +17,13 @@
     br i1 %tmp, label %bb27, label %bb3
 
   bb3:                                              ; preds = %bb
-    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6)
     br label %bb9
 
   bb9:                                              ; preds = %bb9, %bb3
     %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
     %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
-    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ]
     %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
     %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -47,7 +47,7 @@
   }
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
@@ -163,7 +163,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     $r3 = tMOVr $r0, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
index 54e072cf58b94..5b9418390d93c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
@@ -16,13 +16,13 @@
     br i1 %tmp, label %bb27, label %bb3
 
   bb3:                                              ; preds = %bb
-    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6)
     br label %bb9
 
   bb9:                                              ; preds = %bb9, %bb3
     %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
     %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
-    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ]
     %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
     %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
     %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
@@ -78,7 +78,7 @@
 
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
   declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
@@ -193,7 +193,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     $r3 = tMOVr $r0, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -335,7 +335,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     $r3 = tMOVr $r0, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -478,7 +478,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     $r3 = tMOVr $r0, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -621,7 +621,7 @@ body:             |
     VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
     $r3 = tMOVr $r0, 14, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.bb9:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir
index b487fa044e86d..285cb46465f3c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir
@@ -14,14 +14,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -42,7 +42,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.start.loop.iterations.i32(i32) #1
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #2
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
   declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
@@ -143,7 +143,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir
index 9790c04d204ff..6a5d7496d21e2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir
@@ -14,14 +14,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -42,7 +42,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -142,7 +142,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir
index 65ebff2f696bf..ef702514d5700 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir
@@ -14,14 +14,14 @@
     br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
     %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
     %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
     %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
@@ -42,7 +42,7 @@
   for.cond.cleanup:                                 ; preds = %vector.body, %entry
     ret void
   }
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
@@ -142,7 +142,7 @@ body:             |
     renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
index 9799ceb98c676..542412ec43fdb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
@@ -14,7 +14,7 @@
     br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     %6 = shl i32 %4, 3
     %7 = sub i32 %N, %6
     br label %vector.body
@@ -23,7 +23,7 @@
     %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
     %vec.phi = phi <8 x i16> [ <i16 32767, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %vector.ph ], [ %15, %vector.body ]
-    %8 = phi i32 [ %5, %vector.ph ], [ %16, %vector.body ]
+    %8 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ]
     %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ]
     %lsr.iv2022 = bitcast i8* %lsr.iv20 to <8 x i8>*
     %lsr.iv19 = bitcast i8* %lsr.iv to <8 x i8>*
@@ -55,7 +55,7 @@
   }
   declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
   declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)
 
@@ -189,7 +189,7 @@ body:             |
     renamable $r12 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
     renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 422fc3c874da9..f06e06a0a471c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -15,10 +15,10 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    dls lr, lr
-; CHECK:  .LBB0_2: @ %vector.body
+; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vmov q1, q0
@@ -92,10 +92,10 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    dls lr, lr
-; CHECK:  .LBB1_2: @ %vector.body
+; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vmov q1, q0
@@ -163,10 +163,10 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    bic r1, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    dls lr, lr
-; CHECK:  .LBB2_2: @ %vector.body
+; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vmov q1, q0
@@ -228,9 +228,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
-; CHECK:  .LBB3_2: @ %vector.body
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
@@ -285,9 +285,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
-; CHECK:  .LBB4_2: @ %vector.body
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
@@ -342,9 +342,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.8 lr, r3
-; CHECK:  .LBB5_2: @ %vector.body
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #16
 ; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
@@ -402,9 +402,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
-; CHECK:  .LBB6_2: @ %vector.body
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index c05ed7dcfcfb0..e8da32611be2a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: vec_mul_reduce_add
 
 ; CHECK: vector.ph:
-; CHECK:  call void @llvm.set.loop.iterations.i32
+; CHECK:  %start = call i32 @llvm.start.loop.iterations.i32
 ; CHECK:  br label %vector.body
 
 ; CHECK: vector.body:
@@ -33,7 +33,7 @@ vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %5)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -41,7 +41,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ]
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ]
+  %6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ]
   %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
   %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
@@ -74,6 +74,6 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
 
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.start.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll
index f1a35af8b57ed..4f8add4860958 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll
@@ -23,7 +23,7 @@ vector.ph:                                        ; preds = %entry
   br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new
 
 vector.ph.new:                                    ; preds = %vector.ph
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   %unroll_iter = sub i32 %tmp13, %xtraiter
   br label %vector.body
 
@@ -113,6 +113,6 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2
-declare void @llvm.set.loop.iterations.i32(i32) #3
+declare i32 @llvm.start.loop.iterations.i32(i32) #3
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
index 60a578d81594f..f7e1d86fd1b0c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@@ -18,12 +18,12 @@
 
   vector.ph:                                        ; preds = %entry
     %sub = sub nsw i32 0, %x
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %A, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %18, %vector.body ]
     %7 = phi i32 [ %n, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
     %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@@ -84,12 +84,12 @@
 
   vector.ph:                                        ; preds = %entry
     %sub = sub nsw i32 0, %T
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
     %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %data, %vector.ph ]
-    %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ]
+    %6 = phi i32 [ %start, %vector.ph ], [ %18, %vector.body ]
     %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
     %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
     %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
@@ -151,7 +151,7 @@
 
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 ...
@@ -251,7 +251,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -382,7 +382,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -512,7 +512,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -632,7 +632,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -748,7 +748,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -862,7 +862,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)
@@ -983,7 +983,7 @@ body:             |
     renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
     renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
     renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
index fa7304ebe6ba5..f5d4669a909bf 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
@@ -14,7 +14,7 @@
     br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     %6 = shl i32 %4, 3
     %7 = sub i32 %N, %6
     br label %vector.body
@@ -23,7 +23,7 @@
     %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %c, %vector.ph ]
     %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
     %vec.phi = phi <8 x i16> [ <i16 32767, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %vector.ph ], [ %15, %vector.body ]
-    %8 = phi i32 [ %5, %vector.ph ], [ %16, %vector.body ]
+    %8 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ]
     %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ]
     %lsr.iv2022 = bitcast i8* %lsr.iv20 to <8 x i8>*
     %lsr.iv19 = bitcast i8* %lsr.iv to <8 x i8>*
@@ -55,7 +55,7 @@
   }
   declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
   declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
-  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)
 
@@ -189,7 +189,7 @@ body:             |
     renamable $r12 = t2LSRri killed renamable $r12, 2, 14, $noreg, $noreg
     renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
     renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg
-    t2DoLoopStart renamable $lr
+    $lr = t2DoLoopStart renamable $lr
 
   bb.2.vector.body:
     successors: %bb.2(0x7c000000), %bb.3(0x04000000)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
index 7ef303a1a9499..75df351ac0e85 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
@@ -17,11 +17,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %tmp5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
@@ -61,7 +61,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 
@@ -182,7 +182,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r3
+    $lr = t2DoLoopStart renamable $r3
     $r12 = tMOVr killed $r3, 14, $noreg
     $r3 = tMOVr $r2, 14, $noreg
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
index 00abf1603fb6f..018a2dc7f6211 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
@@ -16,11 +16,11 @@
     br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
 
   vector.ph:                                        ; preds = %entry
-    call void @llvm.set.loop.iterations.i32(i32 %5)
+    %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
     br label %vector.body
 
   vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
     %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
     %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
     %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
@@ -54,7 +54,7 @@
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
-  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.start.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
 ...
@@ -170,7 +170,7 @@ body:             |
     renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
     renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
-    t2DoLoopStart renamable $r12
+    $lr = t2DoLoopStart renamable $r12
     $r3 = tMOVr killed $r12, 14, $noreg
 
   bb.2.vector.body:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index d364eb97fff72..98bfb4e6f9897 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -723,9 +723,9 @@ define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias
 ; CHECK:       @ %bb.0: @ %for.body.us.preheader
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
+; CHECK-NEXT:    ldrd r3, r12, [sp, #16]
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    lsl.w r3, r12, #1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB14_1: @ %for.body.us
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
@@ -1083,11 +1083,11 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    lsr.w r10, r3, #2
+; CHECK-NEXT:    lsr.w r9, r3, #2
 ; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r0, r4, #8
-; CHECK-NEXT:    and r9, r0, #7
+; CHECK-NEXT:    and r8, r0, #7
 ; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
 ; CHECK-NEXT:    asrs r6, r7, #3
 ; CHECK-NEXT:    cmp r6, #1
@@ -1106,7 +1106,7 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r10, r10, #1
+; CHECK-NEXT:    subs.w r9, r9, #1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
@@ -1126,13 +1126,13 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    ldrh.w r3, [r12, #6]
 ; CHECK-NEXT:    ldrh.w r6, [r12, #4]
 ; CHECK-NEXT:    ldrh.w r11, [r12, #2]
-; CHECK-NEXT:    ldrh.w r8, [r12]
+; CHECK-NEXT:    ldrh.w r10, [r12]
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r5, #2
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmul.f16 q0, q0, r8
+; CHECK-NEXT:    vmul.f16 q0, q0, r10
 ; CHECK-NEXT:    adds r1, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r11
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
@@ -1155,8 +1155,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    blo .LBB16_7
 ; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr.w lr, [sp] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_6: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
@@ -1196,13 +1196,13 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_8: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    cmp.w r9, #0
+; CHECK-NEXT:    cmp.w r8, #0
 ; CHECK-NEXT:    beq.w .LBB16_3
 ; CHECK-NEXT:    b .LBB16_9
 ; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov lr, r9
+; CHECK-NEXT:    mov lr, r8
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -1214,7 +1214,7 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    b .LBB16_11
 ; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    add.w r5, r5, r9, lsl #1
+; CHECK-NEXT:    add.w r5, r5, r8, lsl #1
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #24

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 4fe8877aa8bd4..94cad933a4e8f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -701,9 +701,9 @@ define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noali
 ; CHECK:       @ %bb.0: @ %for.body.us.preheader
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
+; CHECK-NEXT:    ldrd r3, r12, [sp, #16]
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    lsl.w r3, r12, #2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB14_1: @ %for.body.us
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
@@ -1088,8 +1088,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    ldrd r7, r4, [r12]
-; CHECK-NEXT:    ldrd r0, r6, [r12, #8]
+; CHECK-NEXT:    ldrd r7, r6, [r12]
+; CHECK-NEXT:    ldrd r0, r4, [r12, #8]
 ; CHECK-NEXT:    ldrd r3, lr, [r12, #16]
 ; CHECK-NEXT:    ldrd r11, r8, [r12, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
@@ -1099,11 +1099,11 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    vmul.f32 q0, q0, r7
 ; CHECK-NEXT:    vldrw.u32 q6, [r5, #-24]
 ; CHECK-NEXT:    vldrw.u32 q4, [r5, #-20]
-; CHECK-NEXT:    vfma.f32 q0, q1, r4
+; CHECK-NEXT:    vfma.f32 q0, q1, r6
 ; CHECK-NEXT:    vldrw.u32 q5, [r5, #-16]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r0
 ; CHECK-NEXT:    vldrw.u32 q2, [r5, #-12]
-; CHECK-NEXT:    vfma.f32 q0, q4, r6
+; CHECK-NEXT:    vfma.f32 q0, q4, r4
 ; CHECK-NEXT:    vldrw.u32 q3, [r5, #-8]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r3
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
@@ -1115,8 +1115,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    blo .LBB16_7
 ; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr.w lr, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_6: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
@@ -1155,13 +1155,13 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    b .LBB16_9
 ; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    mov r4, r5
+; CHECK-NEXT:    mov r6, r5
 ; CHECK-NEXT:    mov lr, r3
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
-; CHECK-NEXT:    vldrw.u32 q1, [r4], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
 ; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    bne .LBB16_10
@@ -1404,15 +1404,13 @@ if.end:                                           ; preds = %while.end, %if.then
 define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) {
 ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    .pad #24
 ; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    ldrb.w lr, [r0]
+; CHECK-NEXT:    ldrb.w r8, [r0]
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    ldrd r12, r0, [r0, #4]
 ; CHECK-NEXT:    cmp r3, #0
@@ -1424,45 +1422,43 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biqu
 ; CHECK-NEXT:    mov r4, sp
 ; CHECK-NEXT:  .LBB17_2: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB17_3 Depth 2
-; CHECK-NEXT:    mov r7, lr
-; CHECK-NEXT:    ldr.w lr, [r0, #12]
-; CHECK-NEXT:    ldrd r5, r6, [r0]
+; CHECK-NEXT:    ldrd r5, r7, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r12]
-; CHECK-NEXT:    vldr s12, [r0, #8]
-; CHECK-NEXT:    vdup.32 q2, lr
-; CHECK-NEXT:    vldr s14, [r0, #16]
+; CHECK-NEXT:    vldr s8, [r0, #8]
+; CHECK-NEXT:    ldr r6, [r0, #12]
 ; CHECK-NEXT:    vstrw.32 q1, [r4]
-; CHECK-NEXT:    vdup.32 q1, r6
-; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vdup.32 q1, r7
+; CHECK-NEXT:    vldr s12, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    vmov.f32 s7, s12
-; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vdup.32 q2, r6
+; CHECK-NEXT:    vmov.f32 s10, s12
+; CHECK-NEXT:    mov r7, r2
+; CHECK-NEXT:    vmov.f32 s11, s12
 ; CHECK-NEXT:  .LBB17_3: @ Parent Loop BB17_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q4, [r1, q0, uxtw #2]
 ; CHECK-NEXT:    vldrw.u32 q5, [r4, q0, uxtw #2]
 ; CHECK-NEXT:    adds r1, #8
 ; CHECK-NEXT:    vfma.f32 q5, q4, r5
-; CHECK-NEXT:    vstmia r6, {s20, s21}
-; CHECK-NEXT:    adds r6, #8
+; CHECK-NEXT:    vstmia r7, {s20, s21}
+; CHECK-NEXT:    adds r7, #8
 ; CHECK-NEXT:    vldrw.u32 q3, [sp, #8]
 ; CHECK-NEXT:    vfma.f32 q3, q5, q2
 ; CHECK-NEXT:    vfma.f32 q3, q4, q1
 ; CHECK-NEXT:    vstrw.32 q3, [r4]
 ; CHECK-NEXT:    le lr, .LBB17_3
 ; CHECK-NEXT:  @ %bb.4: @ in Loop: Header=BB17_2 Depth=1
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    adds r0, #20
-; CHECK-NEXT:    subs.w lr, r7, #1
+; CHECK-NEXT:    subs.w r8, r8, #1
+; CHECK-NEXT:    add.w r0, r0, #20
 ; CHECK-NEXT:    vstrb.8 q3, [r12], #16
 ; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    bne .LBB17_2
 ; CHECK-NEXT:    b .LBB17_7
 ; CHECK-NEXT:  .LBB17_5: @ %.preheader
+; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB17_6: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r12], #16
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -1470,8 +1466,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biqu
 ; CHECK-NEXT:  .LBB17_7:
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
   %5 = alloca [6 x float], align 4
   %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1
   %7 = load float*, float** %6, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
index 86cbec661f1f5..c582f726967f9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -11,8 +11,8 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -74,8 +74,8 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -138,8 +138,8 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -201,8 +201,8 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r4, #4
@@ -332,8 +332,8 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
 ; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add.w r12, r12, #4
@@ -396,19 +396,28 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    bic r12, r12, #3
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r3
 ; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vfms.f32 q3, q2, q1
-; CHECK-NEXT:    vstrw.32 q3, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB6_2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q3, [r2], #16
+; CHECK-NEXT:    le lr, .LBB6_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -462,19 +471,28 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:  .LBB7_1: @ %vector.ph
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    bic r12, r12, #3
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r3
 ; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q3, q2, q1
-; CHECK-NEXT:    vstrw.32 q3, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB7_2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q3, [r2], #16
+; CHECK-NEXT:    le lr, .LBB7_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -593,18 +611,27 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:  .LBB9_1: @ %vector.ph
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    bic r12, r12, #3
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r3
 ; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    subs r3, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q2, q1, q0
-; CHECK-NEXT:    vstrw.32 q2, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB9_2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q2, [r2], #16
+; CHECK-NEXT:    le lr, .LBB9_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -659,8 +686,8 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -724,8 +751,8 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB11_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:  .LBB11_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
index 733b17bdb798e..246ce33eaba60 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@@ -6,10 +6,10 @@ define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w r2, #256
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI0_0
-; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
@@ -52,10 +52,10 @@ define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI1_0
-; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
@@ -102,10 +102,10 @@ define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #64
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI2_0
-; CHECK-NEXT:    mov.w lr, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
@@ -160,10 +160,10 @@ define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w r2, #256
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI3_0
-; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0], #8
@@ -206,10 +206,10 @@ define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI4_0
-; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
@@ -256,10 +256,10 @@ define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #64
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI5_0
-; CHECK-NEXT:    mov.w lr, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #24]
@@ -314,10 +314,10 @@ define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w r2, #256
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI6_0
-; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0], #8
@@ -362,10 +362,10 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI7_0
-; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
@@ -415,10 +415,10 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #64
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI8_0
-; CHECK-NEXT:    mov.w lr, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #24]
@@ -478,10 +478,10 @@ define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI9_0
-; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
@@ -534,10 +534,10 @@ define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    adr r2, .LCPI10_0
-; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index cc9ec3fbffca2..0f3a91ca31af0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -328,9 +328,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
 ; CHECK-NEXT:  .LBB8_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB8_3 Depth 2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB8_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB8_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -405,11 +405,11 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture rea
 ; CHECK-NEXT:  .LBB9_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB9_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -501,9 +501,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture reado
 ; CHECK-NEXT:  .LBB10_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB10_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -570,20 +570,21 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    blt .LBB11_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r1, r2, #7
-; CHECK-NEXT:    movs r6, #1
-; CHECK-NEXT:    sub.w r3, r1, #8
+; CHECK-NEXT:    bic r8, r2, #7
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    sub.w r6, r8, #8
 ; CHECK-NEXT:    vmov.i16 q1, #0x8
+; CHECK-NEXT:    add.w r1, r5, r6, lsr #3
+; CHECK-NEXT:    adr r6, .LCPI11_0
+; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT:    add.w r8, r6, r3, lsr #3
-; CHECK-NEXT:    adr r3, .LCPI11_0
-; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:  .LBB11_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, r8
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB11_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -606,7 +607,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    vmov r7, s16
 ; CHECK-NEXT:    vmov.32 q3[2], r5
 ; CHECK-NEXT:    vmov.u16 r5, q2[3]
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    vmov.32 q3[3], r5
 ; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
@@ -617,7 +618,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    vmov r12, s13
 ; CHECK-NEXT:    ldrh.w r11, [r7]
 ; CHECK-NEXT:    vmov r7, s12
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    ldrh.w r9, [r5]
 ; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    ldrh.w r10, [r6]
@@ -630,16 +631,15 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    vmov.16 q3[3], r9
 ; CHECK-NEXT:    vmov.16 q3[4], r11
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q3[5], r4
+; CHECK-NEXT:    vmov.16 q3[5], r3
 ; CHECK-NEXT:    ldrh r6, [r6]
 ; CHECK-NEXT:    vmov.16 q3[6], r5
 ; CHECK-NEXT:    vmov.16 q3[7], r6
-; CHECK-NEXT:    vstrb.8 q3, [r3], #16
+; CHECK-NEXT:    vstrb.8 q3, [r4], #16
 ; CHECK-NEXT:    le lr, .LBB11_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    cmp r8, r2
 ; CHECK-NEXT:    bne .LBB11_2
 ; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
@@ -704,42 +704,43 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB12_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r1, r2, #7
-; CHECK-NEXT:    movs r7, #1
-; CHECK-NEXT:    sub.w r3, r1, #8
+; CHECK-NEXT:    bic r8, r2, #7
 ; CHECK-NEXT:    adr r6, .LCPI12_2
+; CHECK-NEXT:    sub.w r3, r8, #8
+; CHECK-NEXT:    vldrw.u32 q0, [r6]
+; CHECK-NEXT:    movs r7, #1
 ; CHECK-NEXT:    vmov.i16 q3, #0x18
-; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
-; CHECK-NEXT:    add.w r8, r7, r3, lsr #3
-; CHECK-NEXT:    adr r7, .LCPI12_1
-; CHECK-NEXT:    vldrw.u32 q0, [r7]
+; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
 ; CHECK-NEXT:    adr r3, .LCPI12_0
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    adr r7, .LCPI12_1
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r7]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    dls lr, r8
-; CHECK-NEXT:    ldr r3, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
 ; CHECK-NEXT:  .LBB12_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r4, q5[0]
+; CHECK-NEXT:    vmov.u16 r3, q5[0]
 ; CHECK-NEXT:    vmov.u16 r7, q7[4]
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[1]
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[2]
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[3]
-; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[2]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[3]
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    vmov.u16 r12, q6[0]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmov.32 q1[0], r12
@@ -747,7 +748,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov.u16 r1, q6[1]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r0
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    vmov.u16 r1, q6[2]
 ; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q6[3]
@@ -757,26 +758,26 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov r6, s11
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q4, q1, r0
-; CHECK-NEXT:    ldrh.w r9, [r4]
-; CHECK-NEXT:    vmov.u16 r4, q5[4]
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[5]
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[6]
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[7]
-; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    ldrh.w r9, [r3]
+; CHECK-NEXT:    vmov.u16 r3, q5[4]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[5]
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[6]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[7]
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    ldrh r6, [r6]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    ldrh.w r10, [r4]
-; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    ldrh.w r10, [r3]
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh.w r11, [r4]
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    ldrh.w r11, [r3]
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov.32 q0[0], r7
 ; CHECK-NEXT:    vmov.u16 r7, q7[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r7
@@ -811,7 +812,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    ldrh r7, [r7]
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
@@ -823,7 +824,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov.16 q1[3], r6
 ; CHECK-NEXT:    vmov.16 q1[4], r10
 ; CHECK-NEXT:    vmov.16 q1[5], r11
-; CHECK-NEXT:    vmov.16 q1[6], r4
+; CHECK-NEXT:    vmov.16 q1[6], r3
 ; CHECK-NEXT:    vmov.16 q1[7], r5
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
@@ -877,12 +878,11 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q0, [r3], #16
+; CHECK-NEXT:    vstrb.8 q0, [r4], #16
 ; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    cmp r8, r2
 ; CHECK-NEXT:    bne.w .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #104
@@ -892,6 +892,15 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI12_0:
+; CHECK-NEXT:    .short 1 @ 0x1
+; CHECK-NEXT:    .short 4 @ 0x4
+; CHECK-NEXT:    .short 7 @ 0x7
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 13 @ 0xd
+; CHECK-NEXT:    .short 16 @ 0x10
+; CHECK-NEXT:    .short 19 @ 0x13
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:  .LCPI12_1:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 3 @ 0x3
 ; CHECK-NEXT:    .short 6 @ 0x6
@@ -900,7 +909,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    .short 15 @ 0xf
 ; CHECK-NEXT:    .short 18 @ 0x12
 ; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:  .LCPI12_1:
+; CHECK-NEXT:  .LCPI12_2:
 ; CHECK-NEXT:    .short 2 @ 0x2
 ; CHECK-NEXT:    .short 5 @ 0x5
 ; CHECK-NEXT:    .short 8 @ 0x8
@@ -909,15 +918,6 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    .short 17 @ 0x11
 ; CHECK-NEXT:    .short 20 @ 0x14
 ; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:  .LCPI12_2:
-; CHECK-NEXT:    .short 1 @ 0x1
-; CHECK-NEXT:    .short 4 @ 0x4
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 10 @ 0xa
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 16 @ 0x10
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 22 @ 0x16
 
 
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 2a86ddbede65c..3e8190fff47d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -740,8 +740,8 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT:  .LBB22_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB22_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
@@ -786,8 +786,8 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT:  .LBB23_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r2, r3, r2, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:  .LBB23_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 8bafe44b45c07..2f2f05549cd42 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -448,38 +448,38 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
 ; CHECK-NEXT:    ldrd r9, r12, [sp, #128]
 ; CHECK-NEXT:    sub.w r7, r12, #1
 ; CHECK-NEXT:    movs r6, #1
-; CHECK-NEXT:    adr r5, .LCPI9_0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    add.w r7, r6, r7, lsr #1
 ; CHECK-NEXT:    vdup.32 q1, r9
 ; CHECK-NEXT:    bic r7, r7, #3
-; CHECK-NEXT:    vldrw.u32 q2, [r5]
-; CHECK-NEXT:    subs r7, #4
-; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    vshl.i32 q3, q1, #3
-; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
+; CHECK-NEXT:    subs r7, #4
+; CHECK-NEXT:    add.w r10, r6, r7, lsr #2
+; CHECK-NEXT:    adr r7, .LCPI9_0
 ; CHECK-NEXT:    adr r6, .LCPI9_1
+; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB9_1: @ %for.cond8.preheader.us.us.preheader
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB9_2 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT:    mul r10, r8, r9
+; CHECK-NEXT:    mul r11, r8, r9
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    mul r11, r8, r12
+; CHECK-NEXT:    mul r7, r8, r12
 ; CHECK-NEXT:  .LBB9_2: @ %vector.ph
 ; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT:    vdup.32 q5, r11
+; CHECK-NEXT:    vdup.32 q5, r7
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vshl.i32 q5, q5, #2
 ; CHECK-NEXT:    vmov q6, q1
 ; CHECK-NEXT:    vadd.i32 q5, q5, r0
+; CHECK-NEXT:    dls lr, r10
 ; CHECK-NEXT:    vmov.i32 q4, #0x0
 ; CHECK-NEXT:    vadd.i32 q5, q5, q0
 ; CHECK-NEXT:    vmlas.u32 q6, q2, r5
-; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB9_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=2
@@ -493,11 +493,11 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
 ; CHECK-NEXT:    le lr, .LBB9_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=2
-; CHECK-NEXT:    add.w r6, r5, r10
+; CHECK-NEXT:    add.w r4, r5, r11
 ; CHECK-NEXT:    adds r5, #1
-; CHECK-NEXT:    vaddv.u32 r4, q4
+; CHECK-NEXT:    vaddv.u32 r6, q4
 ; CHECK-NEXT:    cmp r5, r9
-; CHECK-NEXT:    str.w r4, [r2, r6, lsl #2]
+; CHECK-NEXT:    str.w r6, [r2, r4, lsl #2]
 ; CHECK-NEXT:    bne .LBB9_2
 ; CHECK-NEXT:  @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB9_1 Depth=1
@@ -596,7 +596,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    .pad #24
 ; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    strd r0, r2, [sp, #16] @ 8-byte Folded Spill
+; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    mov r0, r3
@@ -610,38 +610,39 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:  .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT:    ldr.w r11, [sp, #116]
+; CHECK-NEXT:    ldr.w r9, [sp, #116]
 ; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    movs r1, #1
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    bic r10, r11, #3
+; CHECK-NEXT:    mov r11, r2
+; CHECK-NEXT:    bic r10, r9, #3
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    sub.w r0, r10, #4
-; CHECK-NEXT:    add.w r8, r1, r0, lsr #2
+; CHECK-NEXT:    add.w r0, r1, r0, lsr #2
 ; CHECK-NEXT:    ldr r1, [sp, #112]
-; CHECK-NEXT:    lsl.w r0, r11, #1
+; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    lsl.w r0, r9, #1
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adr r0, .LCPI10_0
 ; CHECK-NEXT:    vdup.32 q4, r1
 ; CHECK-NEXT:    vldrw.u32 q5, [r0]
 ; CHECK-NEXT:    lsls r4, r1, #1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vshl.i32 q6, q4, #2
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB10_5
 ; CHECK-NEXT:  .LBB10_3: @ %for.cond5.preheader.us73.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add.w r0, r11, r12, lsl #1
 ; CHECK-NEXT:    mov r1, r4
-; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    bl __aeabi_memclr
 ; CHECK-NEXT:  .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    add r9, r11
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    add r8, r9
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    add r1, r0
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    adds r1, #1
@@ -653,7 +654,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
 ; CHECK-NEXT:    ldr r0, [sp, #112]
-; CHECK-NEXT:    cmp.w r11, #0
+; CHECK-NEXT:    cmp.w r9, #0
 ; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    mul r12, r1, r0
 ; CHECK-NEXT:    beq .LBB10_3
@@ -663,31 +664,31 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    b .LBB10_8
 ; CHECK-NEXT:  .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #112]
 ; CHECK-NEXT:    add.w r3, r1, r12
 ; CHECK-NEXT:    adds r1, #1
-; CHECK-NEXT:    strh.w r2, [r0, r3, lsl #1]
-; CHECK-NEXT:    ldr r0, [sp, #112]
 ; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    strh.w r2, [r11, r3, lsl #1]
 ; CHECK-NEXT:    beq .LBB10_4
 ; CHECK-NEXT:  .LBB10_8: @ %for.cond5.preheader.us.us
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT:    cmp.w r11, #3
+; CHECK-NEXT:    cmp.w r9, #3
 ; CHECK-NEXT:    bhi .LBB10_10
 ; CHECK-NEXT:  @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    b .LBB10_13
 ; CHECK-NEXT:  .LBB10_10: @ %vector.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmlas.u32 q1, q5, r1
-; CHECK-NEXT:    dls lr, r8
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB10_11: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
@@ -702,18 +703,18 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:  @ %bb.12: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
 ; CHECK-NEXT:    vaddv.u32 r2, q0
-; CHECK-NEXT:    cmp r10, r11
-; CHECK-NEXT:    mov r5, r10
+; CHECK-NEXT:    cmp r10, r9
+; CHECK-NEXT:    mov r7, r10
 ; CHECK-NEXT:    beq .LBB10_7
 ; CHECK-NEXT:  .LBB10_13: @ %for.body8.us.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
 ; CHECK-NEXT:    ldr r0, [sp, #112]
-; CHECK-NEXT:    sub.w lr, r11, r5
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mla r3, r0, r5, r1
-; CHECK-NEXT:    add r5, r9
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    add.w r5, r8, r7
+; CHECK-NEXT:    mla r3, r0, r7, r1
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    sub.w r7, r9, r7
 ; CHECK-NEXT:    add.w r5, r0, r5, lsl #1
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    add.w r3, r6, r3, lsl #1
 ; CHECK-NEXT:  .LBB10_14: @ %for.body8.us.us
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
@@ -850,54 +851,54 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    ldrd r2, r7, [sp, #104]
-; CHECK-NEXT:    add.w r12, r7, #10
+; CHECK-NEXT:    add.w r8, r7, #10
 ; CHECK-NEXT:    adr r7, .LCPI11_0
 ; CHECK-NEXT:    ldr r1, [sp, #96]
 ; CHECK-NEXT:    vdup.32 q1, r2
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    mov.w r10, #11
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    mov.w r9, #6
+; CHECK-NEXT:    movs r6, #11
 ; CHECK-NEXT:    vshl.i32 q1, q1, #2
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:  .LBB11_1: @ %for.body10.i
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB11_2 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB11_3 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
 ; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT:    mov.w r8, #0
-; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB11_2: @ %for.cond22.preheader.i
 ; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB11_3 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
 ; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:  .LBB11_3: @ %for.body27.i
 ; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=2
 ; CHECK-NEXT:    @ => This Loop Header: Depth=3
 ; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
 ; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT:    mov.w lr, #6
-; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    movs r5, #4
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r9
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r11, #4
 ; CHECK-NEXT:  .LBB11_4: @ %for.body78.us.i
 ; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=2
 ; CHECK-NEXT:    @ Parent Loop BB11_3 Depth=3
 ; CHECK-NEXT:    @ => This Loop Header: Depth=4
 ; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT:    mul r7, r5, r10
-; CHECK-NEXT:    vdup.32 q3, r6
-; CHECK-NEXT:    vdup.32 q2, r8
-; CHECK-NEXT:    mov r11, r12
-; CHECK-NEXT:    vadd.i32 q4, q0, r7
+; CHECK-NEXT:    mul r4, r11, r6
+; CHECK-NEXT:    vdup.32 q3, r5
+; CHECK-NEXT:    vdup.32 q2, r7
+; CHECK-NEXT:    vadd.i32 q4, q0, r4
 ; CHECK-NEXT:    vmla.u32 q3, q4, r2
-; CHECK-NEXT:    adds r7, #113
-; CHECK-NEXT:    vadd.i32 q4, q0, r7
+; CHECK-NEXT:    adds r4, #113
+; CHECK-NEXT:    vadd.i32 q4, q0, r4
+; CHECK-NEXT:    mov r4, r8
 ; CHECK-NEXT:    vmla.u32 q2, q4, r2
 ; CHECK-NEXT:  .LBB11_5: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
@@ -908,36 +909,36 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    vldrb.s32 q6, [r0, q2]
 ; CHECK-NEXT:    vadd.i32 q5, q2, q1
 ; CHECK-NEXT:    vadd.i32 q4, q3, q1
-; CHECK-NEXT:    subs.w r11, r11, #4
+; CHECK-NEXT:    subs r4, #4
 ; CHECK-NEXT:    vadd.i32 q2, q6, r2
 ; CHECK-NEXT:    vldrb.s32 q6, [r1, q3]
 ; CHECK-NEXT:    vmov q3, q4
-; CHECK-NEXT:    vmlava.u32 r4, q2, q6
+; CHECK-NEXT:    vmlava.u32 r12, q2, q6
 ; CHECK-NEXT:    vmov q2, q5
 ; CHECK-NEXT:    bne .LBB11_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB11_4 Depth=4
-; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    add.w r11, r11, #1
 ; CHECK-NEXT:    le lr, .LBB11_4
 ; CHECK-NEXT:  @ %bb.7: @ %for.cond.cleanup77.i
 ; CHECK-NEXT:    @ in Loop: Header=BB11_3 Depth=3
-; CHECK-NEXT:    adds r6, #1
-; CHECK-NEXT:    add.w r9, r9, #1
-; CHECK-NEXT:    cmp r6, r2
+; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    add.w r10, r10, #1
+; CHECK-NEXT:    cmp r5, r2
 ; CHECK-NEXT:    bne .LBB11_3
 ; CHECK-NEXT:  @ %bb.8: @ %for.cond.cleanup26.i
 ; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=2
-; CHECK-NEXT:    add.w r8, r8, #1
-; CHECK-NEXT:    cmp r8, r3
+; CHECK-NEXT:    adds r7, #1
+; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    bne .LBB11_2
 ; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup20.i
 ; CHECK-NEXT:    @ in Loop: Header=BB11_1 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r5, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r7, [sp, #148]
-; CHECK-NEXT:    adds r6, #1
-; CHECK-NEXT:    cmp r6, r7
+; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    cmp r5, r7
 ; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq r6, #0
+; CHECK-NEXT:    moveq r5, #0
 ; CHECK-NEXT:    b .LBB11_1
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.10:

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
index bfc64b8c8e261..5b9f1fd215409 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
@@ -7,10 +7,10 @@ define void @ptr_iv_v4i32(i32* noalias nocapture readonly %A, i32* noalias nocap
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI0_0
-; CHECK-NEXT:    mov.w lr, #249
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, q0, uxtw #2]
@@ -57,12 +57,12 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    adr r1, .LCPI1_0
+; CHECK-NEXT:    movs r1, #249
 ; CHECK-NEXT:    adr r3, .LCPI1_1
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    adr r1, .LCPI1_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    mov.w lr, #249
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
@@ -115,10 +115,10 @@ define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocap
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI2_0
-; CHECK-NEXT:    mov.w lr, #249
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, q0, uxtw #1]
@@ -170,12 +170,12 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
 ; CHECK-NEXT:    adr.w r12, .LCPI3_0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI3_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vldrw.u32 q1, [r12]
-; CHECK-NEXT:    mov.w lr, #249
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q2, [r0, q0, uxtw #1]
@@ -237,10 +237,10 @@ define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocaptu
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI4_0
-; CHECK-NEXT:    mov.w lr, #249
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q1, [r0, q0]
@@ -300,12 +300,12 @@ define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias no
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
 ; CHECK-NEXT:    adr.w r12, .LCPI5_0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI5_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vldrw.u32 q1, [r12]
-; CHECK-NEXT:    mov.w lr, #249
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q2, [r0, q0]
@@ -383,10 +383,10 @@ define void @ptr_iv_v4f32(float* noalias nocapture readonly %A, float* noalias n
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #249
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    adr r3, .LCPI6_0
-; CHECK-NEXT:    mov.w lr, #249
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, q0, uxtw #2]
@@ -433,12 +433,12 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    adr r1, .LCPI7_0
+; CHECK-NEXT:    movs r1, #249
 ; CHECK-NEXT:    adr r3, .LCPI7_1
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    adr r1, .LCPI7_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    mov.w lr, #249
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
@@ -492,12 +492,12 @@ define void @ptr_iv_v8f16(half* noalias nocapture readonly %A, half* noalias noc
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov s0, r2
-; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    movs r3, #249
 ; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-NEXT:    mov.w lr, #249
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov.f16 r2, s0
+; CHECK-NEXT:    adr r3, .LCPI8_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, q0, uxtw #1]
@@ -550,13 +550,13 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov s0, r2
-; CHECK-NEXT:    adr r2, .LCPI9_0
+; CHECK-NEXT:    movs r2, #249
 ; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-NEXT:    mov.w lr, #249
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    vmov.f16 r1, s0
+; CHECK-NEXT:    adr r2, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    adr r2, .LCPI9_1
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r2]
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
index c1814036a970e..4df2ea3eb89b1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -7,11 +7,11 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32*
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
+; CHECK-NEXT:    movw r0, #1250
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    adr r0, .LCPI0_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    adds r1, r3, #4
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
@@ -79,13 +79,13 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
+; CHECK-NEXT:    movw r0, #1250
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    adr r0, .LCPI1_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    add.w r12, r3, #4
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vmov.i32 q0, #0x14
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
@@ -153,14 +153,14 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
+; CHECK-NEXT:    movw r0, #1250
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    adr r0, .LCPI2_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    movw lr, #1250
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vmov.i32 q2, #0x3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    adds r1, r3, #4
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r3
@@ -239,9 +239,9 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    adr r6, .LCPI3_4
 ; CHECK-NEXT:    adr r5, .LCPI3_3
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    adr r4, .LCPI3_2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #160] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    adr.w r8, .LCPI3_1

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
index bad1402434f2b..b7e1c340fc5e5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
@@ -20,9 +20,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
 ; CHECK-NEXT:  .LBB0_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB0_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2

diff --git a/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
index ac144dd414840..a4a41068a5c1a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
@@ -9,26 +9,25 @@ define void @tailpred(half* nocapture readonly %pSrcA, half* nocapture readonly
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB0_6
 ; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
-; CHECK-NEXT:    add.w r4, r2, r3, lsl #1
-; CHECK-NEXT:    add.w r5, r1, r3, lsl #1
-; CHECK-NEXT:    cmp r4, r1
+; CHECK-NEXT:    add.w r5, r2, r3, lsl #1
+; CHECK-NEXT:    add.w r4, r1, r3, lsl #1
+; CHECK-NEXT:    cmp r5, r1
 ; CHECK-NEXT:    cset r12, hi
-; CHECK-NEXT:    cmp r5, r2
+; CHECK-NEXT:    cmp r4, r2
 ; CHECK-NEXT:    cset lr, hi
-; CHECK-NEXT:    cmp r4, r0
+; CHECK-NEXT:    cmp r5, r0
 ; CHECK-NEXT:    add.w r5, r0, r3, lsl #1
 ; CHECK-NEXT:    cset r4, hi
 ; CHECK-NEXT:    cmp r5, r2
 ; CHECK-NEXT:    cset r5, hi
 ; CHECK-NEXT:    ands r4, r5
 ; CHECK-NEXT:    lsls r4, r4, #31
-; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    andeq.w r3, lr, r12
-; CHECK-NEXT:    lslseq.w r3, r3, #31
+; CHECK-NEXT:    andeq.w r5, lr, r12
+; CHECK-NEXT:    lslseq.w r5, r5, #31
 ; CHECK-NEXT:    beq .LBB0_4
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.preheader
-; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_3: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r0]
@@ -41,29 +40,14 @@ define void @tailpred(half* nocapture readonly %pSrcA, half* nocapture readonly
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_4: @ %vector.ph
-; CHECK-NEXT:    adds r3, r4, #7
-; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    bic r3, r3, #7
-; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    add.w r5, r5, r3, lsr #3
-; CHECK-NEXT:    mov r3, r5
-; CHECK-NEXT:    mov r5, r4
+; CHECK-NEXT:    dlstp.16 lr, r3
 ; CHECK-NEXT:  .LBB0_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov lr, r3
-; CHECK-NEXT:    vctp.16 r5
-; CHECK-NEXT:    sub.w lr, lr, #1
-; CHECK-NEXT:    subs r5, #8
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrht.u16 q0, [r0], #16
-; CHECK-NEXT:    vldrht.u16 q1, [r1], #16
-; CHECK-NEXT:    mov r3, lr
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vstrht.16 q0, [r2], #16
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    bne .LBB0_5
-; CHECK-NEXT:    b .LBB0_6
+; CHECK-NEXT:    vstrh.16 q0, [r2], #16
+; CHECK-NEXT:    letp lr, .LBB0_5
 ; CHECK-NEXT:  .LBB0_6: @ %while.end
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -128,73 +112,68 @@ while.end:                                        ; preds = %vector.body, %while
 define void @notailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) {
 ; CHECK-LABEL: notailpred:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r8, r9, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    cbz r3, .LBB1_6
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    cmp r3, #8
 ; CHECK-NEXT:    blo .LBB1_3
 ; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
-; CHECK-NEXT:    add.w r4, r2, r3, lsl #1
-; CHECK-NEXT:    add.w r5, r1, r3, lsl #1
-; CHECK-NEXT:    cmp r4, r1
-; CHECK-NEXT:    add.w r6, r0, r3, lsl #1
-; CHECK-NEXT:    cset r12, hi
-; CHECK-NEXT:    cmp r5, r2
-; CHECK-NEXT:    cset r5, hi
-; CHECK-NEXT:    cmp r4, r0
-; CHECK-NEXT:    cset r4, hi
+; CHECK-NEXT:    add.w r5, r2, r3, lsl #1
+; CHECK-NEXT:    add.w r6, r1, r3, lsl #1
+; CHECK-NEXT:    cmp r5, r1
+; CHECK-NEXT:    add.w r4, r0, r3, lsl #1
+; CHECK-NEXT:    cset r7, hi
 ; CHECK-NEXT:    cmp r6, r2
 ; CHECK-NEXT:    cset r6, hi
-; CHECK-NEXT:    ands r6, r4
-; CHECK-NEXT:    lsls r6, r6, #31
+; CHECK-NEXT:    cmp r5, r0
+; CHECK-NEXT:    cset r5, hi
+; CHECK-NEXT:    cmp r4, r2
+; CHECK-NEXT:    cset r4, hi
+; CHECK-NEXT:    ands r5, r4
+; CHECK-NEXT:    lsls r5, r5, #31
 ; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    andeq.w r6, r5, r12
-; CHECK-NEXT:    lslseq.w r6, r6, #31
+; CHECK-NEXT:    andeq r7, r6
+; CHECK-NEXT:    lslseq.w r7, r7, #31
 ; CHECK-NEXT:    beq .LBB1_7
 ; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    mov lr, r3
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r7, r2
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:  .LBB1_4: @ %while.body.preheader31
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r5
 ; CHECK-NEXT:  .LBB1_5: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r12]
-; CHECK-NEXT:    vldr.16 s2, [r5]
-; CHECK-NEXT:    adds r5, #2
+; CHECK-NEXT:    vldr.16 s2, [r4]
+; CHECK-NEXT:    adds r4, #2
 ; CHECK-NEXT:    add.w r12, r12, #2
 ; CHECK-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r4]
-; CHECK-NEXT:    adds r4, #2
+; CHECK-NEXT:    vstr.16 s0, [r7]
+; CHECK-NEXT:    adds r7, #2
 ; CHECK-NEXT:    le lr, .LBB1_5
 ; CHECK-NEXT:  .LBB1_6: @ %while.end
-; CHECK-NEXT:    pop.w {r4, r5, r6, r8, r9, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:  .LBB1_7: @ %vector.ph
-; CHECK-NEXT:    bic r8, r3, #7
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    sub.w r5, r8, #8
-; CHECK-NEXT:    and r9, r3, #7
-; CHECK-NEXT:    add.w r12, r0, r8, lsl #1
-; CHECK-NEXT:    add.w r5, r4, r5, lsr #3
-; CHECK-NEXT:    add.w r4, r2, r8, lsl #1
-; CHECK-NEXT:    mov r6, r5
-; CHECK-NEXT:    add.w r5, r1, r8, lsl #1
+; CHECK-NEXT:    bic r6, r3, #7
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    sub.w r7, r6, #8
+; CHECK-NEXT:    add.w r4, r1, r6, lsl #1
+; CHECK-NEXT:    add.w r12, r0, r6, lsl #1
+; CHECK-NEXT:    add.w r5, r5, r7, lsr #3
+; CHECK-NEXT:    add.w r7, r2, r6, lsl #1
+; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    and r5, r3, #7
 ; CHECK-NEXT:  .LBB1_8: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
-; CHECK-NEXT:    mov lr, r6
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
-; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    mov r6, lr
-; CHECK-NEXT:    bne .LBB1_8
-; CHECK-NEXT:    b .LBB1_9
-; CHECK-NEXT:  .LBB1_9: @ %middle.block
-; CHECK-NEXT:    cmp r8, r3
-; CHECK-NEXT:    mov lr, r9
+; CHECK-NEXT:    le lr, .LBB1_8
+; CHECK-NEXT:  @ %bb.9: @ %middle.block
+; CHECK-NEXT:    cmp r6, r3
 ; CHECK-NEXT:    bne .LBB1_4
 ; CHECK-NEXT:    b .LBB1_6
 entry:

diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index d2819034c44bd..53d942c3decde 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -13,36 +13,43 @@ define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    cmp.w r12, #2
 ; CHECK-NEXT:    blo .LBB0_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr.w r9, [r0, #8]
+; CHECK-NEXT:    ldr r5, [r0, #8]
 ; CHECK-NEXT:    ldr r3, [r0]
-; CHECK-NEXT:    add.w r3, r3, r9, lsl #2
+; CHECK-NEXT:    adds r0, r5, #3
+; CHECK-NEXT:    bic r0, r0, #3
+; CHECK-NEXT:    add.w r4, r3, r5, lsl #2
+; CHECK-NEXT:    subs r3, r0, #4
 ; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    lsl.w r8, r9, #2
+; CHECK-NEXT:    lsl.w r9, r5, #2
+; CHECK-NEXT:    add.w r8, r0, r3, lsr #2
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    mov r6, r1
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    mov r5, r9
-; CHECK-NEXT:    dlstp.32 lr, r5
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    mov r3, r4
+; CHECK-NEXT:    mov r6, r5
 ; CHECK-NEXT:  .LBB0_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrw.u32 q1, [r6], #16
-; CHECK-NEXT:    vldrw.u32 q2, [r7], #16
-; CHECK-NEXT:    vfma.f32 q0, q2, q1
-; CHECK-NEXT:    letp lr, .LBB0_3
+; CHECK-NEXT:    vctp.32 r6
+; CHECK-NEXT:    subs r6, #4
+; CHECK-NEXT:    vpsttt
+; CHECK-NEXT:    vldrwt.u32 q1, [r7], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r3], #16
+; CHECK-NEXT:    vfmat.f32 q0, q2, q1
+; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s4, s2, s3
-; CHECK-NEXT:    add.w r7, r2, r0, lsl #2
+; CHECK-NEXT:    add.w r3, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    adds r0, #1
-; CHECK-NEXT:    add r3, r8
+; CHECK-NEXT:    add r4, r9
 ; CHECK-NEXT:    cmp r0, r12
 ; CHECK-NEXT:    vadd.f32 s0, s0, s4
-; CHECK-NEXT:    vstr s0, [r7]
+; CHECK-NEXT:    vstr s0, [r3]
 ; CHECK-NEXT:    bne .LBB0_2
 ; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
@@ -115,37 +122,45 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    ldr.w r12, [r0, #8]
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    ldr r3, [r0]
-; CHECK-NEXT:    add.w r5, r3, r12, lsl #2
+; CHECK-NEXT:    add.w r0, r12, #3
+; CHECK-NEXT:    bic r0, r0, #3
+; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
+; CHECK-NEXT:    subs r0, #4
 ; CHECK-NEXT:    add.w r7, r3, r12, lsl #3
-; CHECK-NEXT:    lsl.w r8, r12, #3
+; CHECK-NEXT:    lsl.w r10, r12, #3
+; CHECK-NEXT:    add.w r8, r4, r0, lsr #2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
+; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    ldr r6, [sp] @ 4-byte Reload
-; CHECK-NEXT:    add.w r10, r4, #1
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT:    add.w r11, r4, #1
+; CHECK-NEXT:    mov r3, r6
 ; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    dlstp.32 lr, r12
-; CHECK-NEXT:    mov r11, r12
+; CHECK-NEXT:    mov r9, r12
 ; CHECK-NEXT:  .LBB1_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrw.u32 q2, [r6], #16
-; CHECK-NEXT:    vldrw.u32 q3, [r3], #16
-; CHECK-NEXT:    vfma.f32 q1, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
-; CHECK-NEXT:    vfma.f32 q0, q3, q2
-; CHECK-NEXT:    letp lr, .LBB1_3
+; CHECK-NEXT:    vctp.32 r9
+; CHECK-NEXT:    sub.w r9, r9, #4
+; CHECK-NEXT:    vpstttt
+; CHECK-NEXT:    vldrwt.u32 q2, [r5], #16
+; CHECK-NEXT:    vldrwt.u32 q3, [r3], #16
+; CHECK-NEXT:    vfmat.f32 q1, q3, q2
+; CHECK-NEXT:    vldrwt.u32 q3, [r0], #16
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vfmat.f32 q0, q3, q2
+; CHECK-NEXT:    le lr, .LBB1_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s8, s2, s3
-; CHECK-NEXT:    add.w r0, r2, r10, lsl #2
+; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    add r5, r8
+; CHECK-NEXT:    add r6, r10
 ; CHECK-NEXT:    vadd.f32 s2, s6, s7
-; CHECK-NEXT:    add r7, r8
+; CHECK-NEXT:    add r7, r10
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    vadd.f32 s0, s0, s8
 ; CHECK-NEXT:    vadd.f32 s2, s4, s2
@@ -242,40 +257,40 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo .LBB2_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr r7, [r0, #8]
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
-; CHECK-NEXT:    add.w r7, r1, r3, lsl #2
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
-; CHECK-NEXT:    adds r3, #3
+; CHECK-NEXT:    ldr r3, [r0]
+; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r7, r7, lsl #1
+; CHECK-NEXT:    add.w r12, r3, r7, lsl #2
+; CHECK-NEXT:    add.w r1, r3, r7, lsl #3
+; CHECK-NEXT:    add.w r8, r3, r0, lsl #2
+; CHECK-NEXT:    adds r3, r7, #3
 ; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    add.w r1, r1, r0, lsl #2
+; CHECK-NEXT:    lsls r7, r0, #2
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    lsl.w r11, r0, #2
 ; CHECK-NEXT:    add.w r3, r5, r3, lsr #2
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    add.w r9, r5, #2
-; CHECK-NEXT:    add.w r10, r5, #1
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    add.w r11, r5, #1
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    ldr.w r8, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r1
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    ldr.w r10, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    mov r4, r8
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:  .LBB2_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB2_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vctp.32 r8
-; CHECK-NEXT:    sub.w r8, r8, #4
+; CHECK-NEXT:    vctp.32 r10
+; CHECK-NEXT:    sub.w r10, r10, #4
 ; CHECK-NEXT:    vpstttt
 ; CHECK-NEXT:    vldrwt.u32 q3, [r6], #16
 ; CHECK-NEXT:    vldrwt.u32 q4, [r3], #16
@@ -289,13 +304,13 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s12, s10, s11
-; CHECK-NEXT:    add.w r0, r2, r10, lsl #2
+; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    add r7, r11
+; CHECK-NEXT:    add r12, r7
 ; CHECK-NEXT:    vadd.f32 s10, s6, s7
-; CHECK-NEXT:    add r12, r11
+; CHECK-NEXT:    add r1, r7
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    add r1, r11
+; CHECK-NEXT:    add r8, r7
 ; CHECK-NEXT:    vadd.f32 s6, s2, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s2, s8, s12
@@ -416,10 +431,10 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    ldr r1, [r0]
 ; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
-; CHECK-NEXT:    add.w r10, r1, r3, lsl #3
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #4
-; CHECK-NEXT:    add.w r8, r1, r0, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
+; CHECK-NEXT:    add.w r10, r1, r3, lsl #4
+; CHECK-NEXT:    add.w r9, r1, r0, lsl #2
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    bic r0, r0, #3
 ; CHECK-NEXT:    lsls r7, r3, #4
@@ -429,22 +444,22 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB3_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    adds r0, r6, #3
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    mov r3, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    mov r3, r8
+; CHECK-NEXT:    mov r5, r9
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    adds r0, r6, #3
 ; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r6, #2
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    mov r4, r10
 ; CHECK-NEXT:    ldr.w r11, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r5, r8
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r6, #1
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    mov r0, r10
-; CHECK-NEXT:    mov r4, r9
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:  .LBB3_3: @ %vector.body
@@ -470,15 +485,15 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vadd.f32 s16, s14, s15
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    add r12, r7
+; CHECK-NEXT:    add r8, r7
 ; CHECK-NEXT:    vadd.f32 s14, s10, s11
-; CHECK-NEXT:    add r10, r7
+; CHECK-NEXT:    add r12, r7
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s10, s6, s7
-; CHECK-NEXT:    add r8, r7
-; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    add r9, r7
+; CHECK-NEXT:    vadd.f32 s4, s4, s5
+; CHECK-NEXT:    add r10, r7
 ; CHECK-NEXT:    vadd.f32 s6, s2, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s2, s12, s16
@@ -618,7 +633,7 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r5, r3, #2
@@ -630,51 +645,51 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB4_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    adds r1, r0, #4
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    add.w r10, r0, #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    adds r7, r0, #1
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r11, r0, #1
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r3, r8
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    ldr.w r11, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q2, q1
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:  .LBB4_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    add.w r8, r3, r5
-; CHECK-NEXT:    vctp.32 r1
+; CHECK-NEXT:    add.w r9, r3, r5
+; CHECK-NEXT:    vctp.32 r11
 ; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q5, [r6], #16
+; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q6, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q3, q6, q5
-; CHECK-NEXT:    add.w r9, r8, r5
+; CHECK-NEXT:    add.w r12, r9, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q6, [r8]
+; CHECK-NEXT:    vldrwt.u32 q6, [r9]
 ; CHECK-NEXT:    vfmat.f32 q4, q6, q5
-; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    add.w r4, r9, r5
+; CHECK-NEXT:    sub.w r11, r11, #4
+; CHECK-NEXT:    add.w r4, r12, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q6, [r9]
+; CHECK-NEXT:    vldrwt.u32 q6, [r12]
 ; CHECK-NEXT:    vfmat.f32 q2, q6, q5
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    adds r6, r4, r5
 ; CHECK-NEXT:    vpstttt
 ; CHECK-NEXT:    vldrwt.u32 q6, [r4]
 ; CHECK-NEXT:    vfmat.f32 q0, q6, q5
-; CHECK-NEXT:    vldrwt.u32 q6, [r7]
+; CHECK-NEXT:    vldrwt.u32 q6, [r6]
 ; CHECK-NEXT:    vfmat.f32 q1, q6, q5
 ; CHECK-NEXT:    le lr, .LBB4_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB4_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s20, s18, s19
-; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
+; CHECK-NEXT:    add.w r1, r2, r7, lsl #2
 ; CHECK-NEXT:    vadd.f32 s16, s16, s17
 ; CHECK-NEXT:    vadd.f32 s18, s14, s15
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
@@ -702,7 +717,7 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    add r12, r1
+; CHECK-NEXT:    add r8, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB4_2
@@ -839,7 +854,7 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    adds r0, r3, #3
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r5, r3, #2
@@ -851,21 +866,21 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB5_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    adds r1, r0, #5
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    add.w r11, r0, #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    adds r4, r0, #1
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    adds r1, r0, #5
 ; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    mov r3, r9
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    adds r6, r0, #1
-; CHECK-NEXT:    ldr.w r12, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    ldr.w r8, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vmov q5, q1
@@ -873,36 +888,36 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB5_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    add.w r9, r3, r5
-; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    add.w r12, r3, r5
+; CHECK-NEXT:    vctp.32 r8
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrwt.u32 q6, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q7, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q4, q7, q6
-; CHECK-NEXT:    add.w r10, r9, r5
+; CHECK-NEXT:    add.w r10, r12, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q7, [r9]
+; CHECK-NEXT:    vldrwt.u32 q7, [r12]
 ; CHECK-NEXT:    vfmat.f32 q5, q7, q6
-; CHECK-NEXT:    add.w r4, r10, r5
+; CHECK-NEXT:    add.w r6, r10, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r10]
 ; CHECK-NEXT:    vfmat.f32 q2, q7, q6
-; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    sub.w r8, r8, #4
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q7, [r4]
+; CHECK-NEXT:    vldrwt.u32 q7, [r6]
 ; CHECK-NEXT:    vfmat.f32 q0, q7, q6
-; CHECK-NEXT:    adds r4, r7, r5
+; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpstttt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r7]
 ; CHECK-NEXT:    vfmat.f32 q3, q7, q6
-; CHECK-NEXT:    vldrwt.u32 q7, [r4]
+; CHECK-NEXT:    vldrwt.u32 q7, [r6]
 ; CHECK-NEXT:    vfmat.f32 q1, q7, q6
 ; CHECK-NEXT:    le lr, .LBB5_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s24, s22, s23
-; CHECK-NEXT:    add.w r1, r2, r6, lsl #2
+; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vadd.f32 s20, s20, s21
 ; CHECK-NEXT:    vadd.f32 s22, s18, s19
 ; CHECK-NEXT:    vadd.f32 s16, s16, s17
@@ -936,7 +951,7 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
 ; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT:    add r8, r1
+; CHECK-NEXT:    add r9, r1
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB5_2
@@ -1098,11 +1113,12 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB6_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    adds r1, r0, #6
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    adds r6, r0, #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    adds r4, r0, #2
+; CHECK-NEXT:    add.w r8, r0, #1
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    adds r1, r0, #6
 ; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #5
 ; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill
@@ -1110,11 +1126,10 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    add.w r8, r0, #1
-; CHECK-NEXT:    ldr.w r9, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    ldr.w r9, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vmov q3, q2
 ; CHECK-NEXT:    vmov q6, q2
@@ -1124,9 +1139,9 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r10, r3, r5
-; CHECK-NEXT:    vctp.32 r9
+; CHECK-NEXT:    vctp.32 r1
 ; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
+; CHECK-NEXT:    vldrwt.u32 q7, [r9], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
 ; CHECK-NEXT:    add.w r11, r10, r5
@@ -1137,16 +1152,16 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r11]
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
-; CHECK-NEXT:    add.w r4, r11, r5
+; CHECK-NEXT:    add.w r6, r11, r5
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vmov q5, q4
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vmov q2, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r4]
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
@@ -1156,14 +1171,14 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vmov q4, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    sub.w r9, r9, #4
-; CHECK-NEXT:    adds r4, r7, r5
+; CHECK-NEXT:    subs r1, #4
+; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
 ; CHECK-NEXT:    vfmat.f32 q3, q0, q7
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpstttt
-; CHECK-NEXT:    vldrwt.u32 q0, [r4]
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
 ; CHECK-NEXT:    vfmat.f32 q4, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
 ; CHECK-NEXT:    vfmat.f32 q2, q0, q7
@@ -1195,7 +1210,7 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    adds r0, #7
 ; CHECK-NEXT:    vadd.f32 s10, s9, s10
 ; CHECK-NEXT:    vstr s2, [r1]
-; CHECK-NEXT:    add.w r1, r2, r6, lsl #2
+; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vadd.f32 s8, s8, s20
 ; CHECK-NEXT:    vadd.f32 s6, s5, s6
 ; CHECK-NEXT:    vstr s4, [r1]
@@ -1386,11 +1401,12 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:  .LBB7_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT:    ldr.w lr, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    adds r4, r0, #3
+; CHECK-NEXT:    add.w r8, r0, #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    adds r1, r0, #7
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    adds r6, r0, #3
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #6
 ; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill
@@ -1398,18 +1414,17 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    ldr.w r12, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    mov r3, r9
 ; CHECK-NEXT:    ldr.w r10, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r8, r0, #2
+; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
 ; CHECK-NEXT:    adds r1, r0, #1
-; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov q7, q2
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov q6, q3
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB7_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -1419,95 +1434,95 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vldrwt.u32 q0, [r12], #16
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vfmat.f32 q6, q1, q0
-; CHECK-NEXT:    add.w r4, r11, r5
+; CHECK-NEXT:    add.w r6, r11, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r11]
 ; CHECK-NEXT:    vfmat.f32 q7, q1, q0
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q7, q6
 ; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov q5, q4
-; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q1, [r4]
-; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    vldrwt.u32 q1, [r6]
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmat.f32 q3, q1, q0
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vfmat.f32 q2, q1, q0
+; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vfmat.f32 q3, q1, q0
-; CHECK-NEXT:    adds r4, r7, r5
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vfmat.f32 q2, q1, q0
+; CHECK-NEXT:    adds r6, r7, r5
+; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vmov q4, q5
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    vmov q6, q7
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r4, r5
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r4]
-; CHECK-NEXT:    vfmat.f32 q3, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q1, [r6]
+; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    sub.w r10, r10, #4
-; CHECK-NEXT:    adds r4, r7, r5
+; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpstttt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    vfmat.f32 q4, q1, q0
-; CHECK-NEXT:    vldrwt.u32 q1, [r4]
+; CHECK-NEXT:    vldrwt.u32 q1, [r6]
 ; CHECK-NEXT:    vfmat.f32 q5, q1, q0
-; CHECK-NEXT:    add r4, r5
+; CHECK-NEXT:    add r6, r5
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r4]
-; CHECK-NEXT:    vfmat.f32 q2, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q1, [r6]
+; CHECK-NEXT:    vfmat.f32 q3, q1, q0
 ; CHECK-NEXT:    le lr, .LBB7_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB7_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s0, s30, s31
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vadd.f32 s2, s28, s29
-; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    vadd.f32 s5, s10, s11
+; CHECK-NEXT:    vadd.f32 s12, s12, s13
+; CHECK-NEXT:    vadd.f32 s5, s14, s15
 ; CHECK-NEXT:    vadd.f32 s4, s26, s27
 ; CHECK-NEXT:    vadd.f32 s6, s24, s25
-; CHECK-NEXT:    vadd.f32 s10, s18, s19
+; CHECK-NEXT:    vadd.f32 s14, s18, s19
 ; CHECK-NEXT:    vadd.f32 s7, s16, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 s9, s14, s15
-; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vadd.f32 s14, s18, s19
-; CHECK-NEXT:    vadd.f32 s11, s16, s17
+; CHECK-NEXT:    vadd.f32 s8, s8, s9
+; CHECK-NEXT:    vadd.f32 s13, s10, s11
+; CHECK-NEXT:    vadd.f32 s10, s18, s19
+; CHECK-NEXT:    vadd.f32 s9, s16, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.f32 s0, s2, s0
-; CHECK-NEXT:    vadd.f32 s13, s18, s19
+; CHECK-NEXT:    vadd.f32 s11, s18, s19
 ; CHECK-NEXT:    vadd.f32 s15, s16, s17
 ; CHECK-NEXT:    vadd.f32 s2, s6, s4
-; CHECK-NEXT:    vadd.f32 s6, s8, s5
-; CHECK-NEXT:    vadd.f32 s8, s7, s10
-; CHECK-NEXT:    vadd.f32 s10, s12, s9
-; CHECK-NEXT:    vadd.f32 s12, s11, s14
+; CHECK-NEXT:    vadd.f32 s6, s12, s5
+; CHECK-NEXT:    vadd.f32 s12, s7, s14
+; CHECK-NEXT:    vadd.f32 s10, s9, s10
 ; CHECK-NEXT:    vstr s0, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
-; CHECK-NEXT:    vadd.f32 s1, s22, s23
-; CHECK-NEXT:    vadd.f32 s14, s15, s13
+; CHECK-NEXT:    vadd.f32 s8, s8, s13
 ; CHECK-NEXT:    adds r0, #8
+; CHECK-NEXT:    vadd.f32 s14, s15, s11
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
+; CHECK-NEXT:    vadd.f32 s1, s22, s23
 ; CHECK-NEXT:    vadd.f32 s3, s20, s21
-; CHECK-NEXT:    vstr s12, [r1]
-; CHECK-NEXT:    add.w r1, r2, r6, lsl #2
+; CHECK-NEXT:    vstr s10, [r1]
+; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vstr s14, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s4, s3, s1
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
-; CHECK-NEXT:    vstr s10, [r1]
+; CHECK-NEXT:    vstr s8, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
-; CHECK-NEXT:    vstr s8, [r1]
+; CHECK-NEXT:    vstr s12, [r1]
 ; CHECK-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]

diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
index 2db5bf59ecfae..2686748555db8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
@@ -8,18 +8,17 @@ define i32 @vaddv(i32* nocapture readonly %data, i32 %N) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    mov lr, r1
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB0_4
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    mov r1, r0
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r1], #32
+; CHECK-NEXT:    vldrw.u32 q0, [r2], #32
 ; CHECK-NEXT:    vaddva.s32 r0, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r1, #-16]
+; CHECK-NEXT:    vldrw.u32 q0, [r2, #-16]
 ; CHECK-NEXT:    vaddva.s32 r0, q0
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
@@ -283,10 +282,10 @@ define void @fma8(float* noalias nocapture readonly %A, float* noalias nocapture
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #8
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #3
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #3
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4, #16]
@@ -305,11 +304,11 @@ define void @fma8(float* noalias nocapture readonly %A, float* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB2_6: @ %for.body.preheader12
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB2_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr s0, [r0]

diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index f87beba4a2de6..0605d50883f41 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -22,10 +22,10 @@ define void @fma(float* noalias nocapture readonly %A, float* noalias nocapture
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
+; CHECK-NEXT:    add.w r6, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #16
@@ -39,11 +39,11 @@ define void @fma(float* noalias nocapture readonly %A, float* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader12
-; CHECK-NEXT:    sub.w lr, r3, r12
+; CHECK-NEXT:    sub.w r3, r3, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
 ; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
 ; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr s0, [r0]
@@ -134,11 +134,11 @@ define void @fma_tailpred(float* noalias nocapture readonly %A, float* noalias n
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vdup.32 q2, r3
@@ -218,9 +218,9 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r9, [sp, #56]
+; CHECK-NEXT:    ldr.w r11, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -232,54 +232,54 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu
 ; CHECK-NEXT:    lsrs r0, r0, #3
 ; CHECK-NEXT:    b .LBB2_5
 ; CHECK-NEXT:  .LBB2_3: @ in Loop: Header=BB2_5 Depth=1
-; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r6, r12
 ; CHECK-NEXT:  .LBB2_4: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    ldr r3, [sp, #72]
-; CHECK-NEXT:    add.w r1, r10, r8
+; CHECK-NEXT:    add.w r1, r8, r10
 ; CHECK-NEXT:    add r1, r6
 ; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    cmp r11, r2
+; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
+; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  .LBB2_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_7 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    subs.w lr, r0, r0
-; CHECK-NEXT:    ldr.w r12, [r1, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
+; CHECK-NEXT:    subs r1, r0, r0
 ; CHECK-NEXT:    ble .LBB2_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #64]
+; CHECK-NEXT:    ldr r7, [sp, #64]
 ; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mla r7, r11, r3, r1
+; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mla r7, r9, r7, r3
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r10, r12
 ; CHECK-NEXT:  .LBB2_7: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB2_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r9
+; CHECK-NEXT:    vadd.i16 q1, q0, r11
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_7
 ; CHECK-NEXT:    b .LBB2_4
 ; CHECK-NEXT:  .LBB2_8: @ %if.end
@@ -401,9 +401,9 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon
 ; CHECK-NEXT:    beq .LBB3_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r9, [sp, #56]
+; CHECK-NEXT:    ldr.w r11, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -417,52 +417,52 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_5 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    subs.w lr, r0, r0
-; CHECK-NEXT:    ldr.w r12, [r1, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
+; CHECK-NEXT:    subs r1, r0, r0
 ; CHECK-NEXT:    ble .LBB3_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #64]
+; CHECK-NEXT:    ldr r7, [sp, #64]
 ; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mla r7, r11, r3, r1
+; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mla r7, r9, r7, r3
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r10, r12
 ; CHECK-NEXT:  .LBB3_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB3_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r9
+; CHECK-NEXT:    vadd.i16 q1, q0, r11
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:    b .LBB3_7
 ; CHECK-NEXT:  .LBB3_6: @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r6, r12
 ; CHECK-NEXT:  .LBB3_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
 ; CHECK-NEXT:    ldr r3, [sp, #72]
-; CHECK-NEXT:    add.w r1, r10, r8
+; CHECK-NEXT:    add.w r1, r8, r10
 ; CHECK-NEXT:    add r1, r6
 ; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    cmp r11, r2
+; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
+; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    bne .LBB3_3
 ; CHECK-NEXT:  .LBB3_8: @ %if.end
 ; CHECK-NEXT:    ldr r0, [sp, #72]
@@ -574,27 +574,35 @@ define i32 @arm_nn_mat_mul_core_4x_s8(i32 %row_elements, i32 %offset, i8* %row_b
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r10, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r10, lr}
+; CHECK-NEXT:    add.w r7, r0, #15
 ; CHECK-NEXT:    ldr.w r12, [sp, #32]
+; CHECK-NEXT:    asrs r6, r7, #31
+; CHECK-NEXT:    add.w r7, r7, r6, lsr #28
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    asrs r5, r7, #4
+; CHECK-NEXT:    cmp r5, #1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    asrgt r6, r7, #4
 ; CHECK-NEXT:    cmp r0, #1
 ; CHECK-NEXT:    blt .LBB4_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    adds r5, r2, r1
-; CHECK-NEXT:    add.w r7, r2, r1, lsl #1
+; CHECK-NEXT:    adds r7, r2, r1
+; CHECK-NEXT:    add.w r5, r2, r1, lsl #1
 ; CHECK-NEXT:    add.w r1, r1, r1, lsl #1
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    dlstp.8 lr, r0
 ; CHECK-NEXT:    add r1, r2
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    dlstp.8 lr, r0
 ; CHECK-NEXT:  .LBB4_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r3], #16
 ; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
 ; CHECK-NEXT:    vmlava.s8 r10, q1, q0
-; CHECK-NEXT:    vldrb.u8 q1, [r7], #16
-; CHECK-NEXT:    vmlava.s8 r4, q1, q0
 ; CHECK-NEXT:    vldrb.u8 q1, [r5], #16
+; CHECK-NEXT:    vmlava.s8 r4, q1, q0
+; CHECK-NEXT:    vldrb.u8 q1, [r7], #16
 ; CHECK-NEXT:    vmlava.s8 r6, q1, q0
 ; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
 ; CHECK-NEXT:    vmlava.s8 r8, q1, q0
@@ -709,12 +717,12 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    lsrs r2, r0, #3
 ; CHECK-NEXT:    b .LBB5_5
 ; CHECK-NEXT:  .LBB5_3: @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r6, r12
 ; CHECK-NEXT:  .LBB5_4: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    add.w r0, r10, r8
+; CHECK-NEXT:    add.w r0, r8, r10
 ; CHECK-NEXT:    ldr r1, [sp, #96]
 ; CHECK-NEXT:    add r0, r6
 ; CHECK-NEXT:    add r0, r12
@@ -727,21 +735,21 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB5_7 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #92]
-; CHECK-NEXT:    subs.w lr, r2, r2
 ; CHECK-NEXT:    ldr.w r12, [r0, r9, lsl #2]
+; CHECK-NEXT:    subs r0, r2, r2
 ; CHECK-NEXT:    ble .LBB5_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
 ; CHECK-NEXT:    ldr.w r11, [sp, #88]
 ; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    dlstp.16 lr, r11
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mla r3, r9, r11, r0
 ; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mla r3, r9, r11, r1
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r10, r12
 ; CHECK-NEXT:  .LBB5_7: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB5_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -754,10 +762,10 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB5_7
 ; CHECK-NEXT:    b .LBB5_4
 ; CHECK-NEXT:  .LBB5_8: @ %if.end
@@ -899,21 +907,21 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB6_5 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #92]
-; CHECK-NEXT:    subs.w lr, r2, r2
 ; CHECK-NEXT:    ldr.w r12, [r0, r9, lsl #2]
+; CHECK-NEXT:    subs r0, r2, r2
 ; CHECK-NEXT:    ble .LBB6_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
 ; CHECK-NEXT:    ldr.w r11, [sp, #88]
 ; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    dlstp.16 lr, r11
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mla r3, r9, r11, r0
 ; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mla r3, r9, r11, r1
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r10, r12
 ; CHECK-NEXT:  .LBB6_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB6_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -926,19 +934,19 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB6_5
 ; CHECK-NEXT:    b .LBB6_7
 ; CHECK-NEXT:  .LBB6_6: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    mov r8, r12
 ; CHECK-NEXT:    mov r6, r12
 ; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    add.w r0, r10, r8
+; CHECK-NEXT:    add.w r0, r8, r10
 ; CHECK-NEXT:    ldr r1, [sp, #96]
 ; CHECK-NEXT:    add r0, r6
 ; CHECK-NEXT:    add r0, r12
@@ -1103,7 +1111,7 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    lsrs r2, r1, #3
 ; CHECK-NEXT:    lsls r1, r1, #1
 ; CHECK-NEXT:    str r2, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #32] @ 4-byte Spill
@@ -1111,22 +1119,22 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:  .LBB7_6: @ Parent Loop BB7_3 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB7_7 Depth 3
-; CHECK-NEXT:    add.w r12, r0, #16
-; CHECK-NEXT:    ldr r4, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w lr, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    ldm.w r12, {r1, r2, r3, r12}
-; CHECK-NEXT:    muls r4, r5, r4
-; CHECK-NEXT:    ldr.w r2, [r2, r10, lsl #2]
-; CHECK-NEXT:    ldr.w r1, [r1, r10, lsl #2]
-; CHECK-NEXT:    ldrd r6, r7, [r0, #32]
+; CHECK-NEXT:    ldrd r3, lr, [r0, #24]
+; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r12, r2, [r0, #16]
 ; CHECK-NEXT:    ldr.w r3, [r3, r10, lsl #2]
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    add.w r6, r6, r2, lsl #2
-; CHECK-NEXT:    add.w r12, r12, r1, lsl #2
-; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r2, r1, r4, lsl #2
-; CHECK-NEXT:    add.w r3, r7, r3, lsl #2
+; CHECK-NEXT:    muls r1, r6, r1
+; CHECK-NEXT:    ldr.w r2, [r2, r10, lsl #2]
+; CHECK-NEXT:    ldrd r7, r5, [r0, #32]
+; CHECK-NEXT:    add.w r5, r5, r3, lsl #2
+; CHECK-NEXT:    ldr.w r4, [r12, r10, lsl #2]
+; CHECK-NEXT:    add.w r3, r7, r2, lsl #2
+; CHECK-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    add.w r2, r2, r1, lsl #2
+; CHECK-NEXT:    add.w r12, lr, r4, lsl #2
 ; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    add.w r8, r1, r11, lsl #2
 ; CHECK-NEXT:    add.w r9, r8, r11, lsl #2
 ; CHECK-NEXT:  .LBB7_7: @ Parent Loop BB7_3 Depth=1
@@ -1145,7 +1153,7 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    vsub.f32 q4, q3, q0
 ; CHECK-NEXT:    vadd.f32 q0, q3, q0
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    vldrw.u32 q0, [r6], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r3], #16
 ; CHECK-NEXT:    vcmul.f32 q3, q0, q4, #0
 ; CHECK-NEXT:    vcmla.f32 q3, q0, q4, #90
 ; CHECK-NEXT:    vstrb.8 q3, [r1], #16
@@ -1153,15 +1161,15 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
 ; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
 ; CHECK-NEXT:    vstrb.8 q3, [r8], #16
-; CHECK-NEXT:    vldrw.u32 q0, [r3], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
 ; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #0
 ; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
 ; CHECK-NEXT:    vstrb.8 q2, [r9], #16
 ; CHECK-NEXT:    le lr, .LBB7_7
 ; CHECK-NEXT:  @ %bb.8: @ in Loop: Header=BB7_6 Depth=2
 ; CHECK-NEXT:    ldr r3, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    adds r5, #1
-; CHECK-NEXT:    cmp r5, r3
+; CHECK-NEXT:    adds r6, #1
+; CHECK-NEXT:    cmp r6, r3
 ; CHECK-NEXT:    bne .LBB7_6
 ; CHECK-NEXT:    b .LBB7_2
 ; CHECK-NEXT:  .LBB7_9:

diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
index 12561d560309a..2e02466844c2e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
@@ -14,9 +14,9 @@ define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) {
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
@@ -71,9 +71,9 @@ define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #3
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
@@ -128,9 +128,9 @@ define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T)
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #4
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
@@ -184,10 +184,10 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T)
 ; CHECK-NEXT:    mvn r2, #3
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    movs r2, #1
-; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r1, lsr #2
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    eor r2, r1, #-2147483648
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -241,13 +241,13 @@ define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.co
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    mvn r3, #7
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vneg.f16 s0, s0
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
+; CHECK-NEXT:    add.w r3, r3, r1, lsr #3
 ; CHECK-NEXT:    vmov.f16 r1, s0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
@@ -307,9 +307,9 @@ define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) {
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
@@ -364,9 +364,9 @@ define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 sign
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #3
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
@@ -421,9 +421,9 @@ define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
+; CHECK-NEXT:    add.w r1, r3, r1, lsr #4
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    rsbs r1, r2, #0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
@@ -477,10 +477,10 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float
 ; CHECK-NEXT:    mvn r2, #3
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    movs r2, #1
-; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r1, lsr #2
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    eor r2, r1, #-2147483648
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -534,13 +534,13 @@ define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float %
 ; CHECK-NEXT:  .LBB9_1: @ %vector.ph
 ; CHECK-NEXT:    mvn r3, #7
 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vneg.f16 s0, s0
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
+; CHECK-NEXT:    add.w r3, r3, r1, lsr #3
 ; CHECK-NEXT:    vmov.f16 r1, s0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]

diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
index 2d890aaac331e..5ce7d3b7d6a52 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -9,21 +9,32 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    mov r12, r1
 ; CHECK-NEXT:    vidup.u32 q2, r6, #1
+; CHECK-NEXT:    cmp r1, #4
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge.w r12, #4
+; CHECK-NEXT:    sub.w r6, r1, r12
+; CHECK-NEXT:    adds r6, #3
+; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI0_0
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    add.w r6, lr, r6, lsr #2
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    dls lr, r6
 ; CHECK-NEXT:    vmov.i32 q3, #0x4
 ; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB0_1: @ %do.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
-; CHECK-NEXT:    vptt.f32 ge, q1, q4
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    vpstttt
+; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
+; CHECK-NEXT:    vcmpt.f32 ge, q1, q4
 ; CHECK-NEXT:    vmovt q1, q4
 ; CHECK-NEXT:    vmovt q0, q2
 ; CHECK-NEXT:    vadd.i32 q2, q2, q3
-; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK-NEXT:    le lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %do.end
 ; CHECK-NEXT:    vldr s8, .LCPI0_1
 ; CHECK-NEXT:    vdup.32 q3, r1

diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index f586857f289f7..c2b1f5bdafab0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -27,17 +27,17 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    bic r3, r3, #1
 ; CHECK-NEXT:    subs r7, r3, #2
-; CHECK-NEXT:    adr r4, .LCPI0_0
 ; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    adr r4, .LCPI0_0
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #1
 ; CHECK-NEXT:    add.w r11, r2, r3, lsl #2
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
 ; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:    mov.w r10, #-1
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrd r4, r5, [r0]
@@ -108,11 +108,11 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    beq .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
-; CHECK-NEXT:    sub.w lr, r3, r7
-; CHECK-NEXT:    mov.w r0, #-1
+; CHECK-NEXT:    subs r0, r3, r7
 ; CHECK-NEXT:    mov.w r1, #-2147483648
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    mvn r2, #-2147483648
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r12], #4
@@ -248,16 +248,16 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    subs r2, r3, #4
 ; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w r11, r8, r3, lsl #2
+; CHECK-NEXT:    add.w r7, r7, r2, lsr #2
 ; CHECK-NEXT:    add.w r10, r1, r3, lsl #2
-; CHECK-NEXT:    add.w lr, r7, r2, lsr #2
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    adr r7, .LCPI1_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    adr r7, .LCPI1_1
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    vldrw.u32 q1, [r7]
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    mov.w r3, #-1
 ; CHECK-NEXT:    mvn r9, #-2147483648
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
@@ -395,11 +395,11 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    cmp r2, r3
 ; CHECK-NEXT:    beq .LBB1_8
 ; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r2
-; CHECK-NEXT:    mov.w r0, #-1
+; CHECK-NEXT:    subs r0, r3, r2
 ; CHECK-NEXT:    mov.w r1, #-2147483648
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    mvn r3, #-2147483648
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r12], #4
@@ -536,15 +536,15 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI2_2
 ; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r7, r6, r7, lsr #2
 ; CHECK-NEXT:    adr r6, .LCPI2_0
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    subs r7, r3, #1
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q3, [r4]
 ; CHECK-NEXT:    vdup.32 q1, r7
 ; CHECK-NEXT:    mov.w r12, #-1
 ; CHECK-NEXT:    mvn r8, #-2147483648
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -769,12 +769,12 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    bic r5, r3, #1
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    subs r7, r5, #2
+; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w r8, r2, r5, lsl #2
 ; CHECK-NEXT:    add.w r11, r1, r5, lsl #2
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
+; CHECK-NEXT:    add.w r4, r6, r7, lsr #1
 ; CHECK-NEXT:    add.w r12, r0, r5, lsl #2
-; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrd r4, r9, [r0]
@@ -816,8 +816,8 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    beq .LBB3_8
 ; CHECK-NEXT:  .LBB3_6: @ %for.body.preheader
-; CHECK-NEXT:    sub.w lr, r3, r7
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r7
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB3_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r0, [r12], #4
@@ -929,9 +929,9 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    sub.w r7, r8, #4
 ; CHECK-NEXT:    add.w r10, r2, r8, lsl #2
 ; CHECK-NEXT:    add.w r9, r1, r8, lsl #2
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    add.w r4, r6, r7, lsr #2
 ; CHECK-NEXT:    add.w r12, r0, r8, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB4_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -1007,8 +1007,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    cmp r8, r3
 ; CHECK-NEXT:    beq .LBB4_8
 ; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r8
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    sub.w r0, r3, r8
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB4_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r0, [r12], #4
@@ -1138,8 +1138,8 @@ define arm_aapcs_vfpcc void @ssatmul_4_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r0, [r12], #2
@@ -1271,8 +1271,8 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB6_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB6_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r0, [r12], #2
@@ -1401,8 +1401,8 @@ define arm_aapcs_vfpcc void @ssatmul_8i_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r0, [r12], #2
@@ -1519,11 +1519,11 @@ define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vdup.32 q2, r3
@@ -1611,13 +1611,13 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI9_1
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
-; CHECK-NEXT:    sub.w r12, r3, #1
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #3
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
+; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -1785,13 +1785,13 @@ define arm_aapcs_vfpcc void @ssatmul_8ti_q15(i16* nocapture readonly %pSrcA, i16
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI10_1
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
-; CHECK-NEXT:    sub.w r12, r3, #1
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #3
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
+; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vdup.32 q6, r3
@@ -1938,9 +1938,9 @@ define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB11_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    movw r0, #65535
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB11_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrh r1, [r12], #2
@@ -2072,9 +2072,9 @@ define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB12_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    movw r0, #65535
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB12_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrh r1, [r12], #2
@@ -2208,8 +2208,8 @@ define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* noc
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB13_6: @ %for.body.preheader21
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB13_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsb r0, [r12], #1
@@ -2335,8 +2335,8 @@ define arm_aapcs_vfpcc void @ssatmul_8_q7(i8* nocapture readonly %pSrcA, i8* noc
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB14_6: @ %for.body.preheader23
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB14_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsb r0, [r12], #1
@@ -2468,8 +2468,8 @@ define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB15_6: @ %for.body.preheader23
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB15_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsb r0, [r12], #1
@@ -2598,8 +2598,8 @@ define arm_aapcs_vfpcc void @ssatmul_16i_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB16_6: @ %for.body.preheader23
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB16_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsb r0, [r12], #1
@@ -2719,13 +2719,13 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI17_1
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
-; CHECK-NEXT:    sub.w r12, r3, #1
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #3
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
+; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB17_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vdup.32 q6, r3
@@ -2840,18 +2840,18 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI18_1
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #4
-; CHECK-NEXT:    sub.w r12, r3, #1
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #4
+; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI18_2
+; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI18_3
-; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vldrw.u32 q6, [r4]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB18_2: @ %vector.body
@@ -3140,18 +3140,18 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8*
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI19_1
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #4
-; CHECK-NEXT:    sub.w r12, r3, #1
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r12, lr, r12, lsr #4
+; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    dls lr, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI19_2
+; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI19_3
-; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vldrw.u32 q6, [r4]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB19_2: @ %vector.body
@@ -3372,8 +3372,8 @@ define arm_aapcs_vfpcc void @usatmul_8_q7(i8* nocapture readonly %pSrcA, i8* noc
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB20_6: @ %for.body.preheader23
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB20_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r0, [r12], #1
@@ -3507,8 +3507,8 @@ define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB21_6: @ %for.body.preheader23
-; CHECK-NEXT:    sub.w lr, r3, r5
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r0, r3, r5
+; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:  .LBB21_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r0, [r12], #1

diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index 0f8a88a7e1842..006413638205e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -163,10 +163,10 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2

diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll b/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll
index 33b3d0ed0b90e..3d3748341387e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll
@@ -9,8 +9,8 @@ define dso_local arm_aapcs_vfpcc void @sink_shl_i32(i32* nocapture readonly %in,
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -53,8 +53,8 @@ define dso_local arm_aapcs_vfpcc void @sink_shl_i16(i16* nocapture readonly %in,
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -97,8 +97,8 @@ define dso_local arm_aapcs_vfpcc void @sink_shl_i8(i8* nocapture readonly %in, i
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #4
@@ -142,8 +142,8 @@ define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(i32* nocapture readonly %in
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -187,8 +187,8 @@ define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(i16* nocapture readonly %in
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -232,8 +232,8 @@ define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(i8* nocapture readonly %in,
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #4
@@ -277,8 +277,8 @@ define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(i32* nocapture readonly %in
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -322,8 +322,8 @@ define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(i16* nocapture readonly %in
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -367,8 +367,8 @@ define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(i8* nocapture readonly %in,
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #4

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 2544474932c93..fbde33c3680ce 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -23,10 +23,10 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    subs r0, r3, #4
-; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #2
 ; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:  .LBB0_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
@@ -37,13 +37,13 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB0_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
-; CHECK-NEXT:    add.w r1, r12, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB0_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r1], #4
-; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    ldr r1, [r2], #4
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    le lr, .LBB0_8
 ; CHECK-NEXT:  .LBB0_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -113,9 +113,9 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r12, r1, #3
 ; CHECK-NEXT:    vmov.i32 q0, #0x1
 ; CHECK-NEXT:    sub.w r3, r12, #4
-; CHECK-NEXT:    add.w lr, r2, r3, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r3, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -132,9 +132,9 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mul r2, r2, lr
 ; CHECK-NEXT:    beq .LBB1_8
 ; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r12
+; CHECK-NEXT:    sub.w r1, r1, r12
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB1_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -213,9 +213,9 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i8 q0, #0xff
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -232,9 +232,9 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    and.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB2_9
 ; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB2_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -313,9 +313,9 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -332,9 +332,9 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    orr.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB3_9
 ; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB3_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -413,9 +413,9 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -432,9 +432,9 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    eor.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB4_9
 ; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB4_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -513,9 +513,9 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r12, r2, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
@@ -528,9 +528,9 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-NEXT:    beq .LBB5_9
 ; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r2
+; CHECK-NEXT:    subs r1, r1, r2
 ; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB5_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr s2, [r0]
@@ -614,9 +614,9 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r12, r2, #4
 ; CHECK-NEXT:    vmov.f32 q0, #1.000000e+00
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
@@ -629,9 +629,9 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-NEXT:    beq .LBB6_9
 ; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r2
+; CHECK-NEXT:    subs r1, r1, r2
 ; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB6_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr s2, [r0]
@@ -711,9 +711,9 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -725,9 +725,9 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vminv.s32 r2, q0
 ; CHECK-NEXT:    beq .LBB7_9
 ; CHECK-NEXT:  .LBB7_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB7_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -809,10 +809,10 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    subs r0, r3, #4
-; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #2
 ; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mvn r0, #-2147483648
 ; CHECK-NEXT:  .LBB8_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
@@ -823,14 +823,14 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB8_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
-; CHECK-NEXT:    add.w r1, r12, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB8_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r1], #4
-; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    csel r0, r0, r2, lt
+; CHECK-NEXT:    ldr r1, [r2], #4
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    csel r0, r0, r1, lt
 ; CHECK-NEXT:    le lr, .LBB8_8
 ; CHECK-NEXT:  .LBB8_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -907,9 +907,9 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x80000000
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -921,9 +921,9 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmaxv.s32 r2, q0
 ; CHECK-NEXT:    beq .LBB9_9
 ; CHECK-NEXT:  .LBB9_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB9_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -1005,10 +1005,10 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    subs r0, r3, #4
-; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #2
 ; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mov.w r0, #-2147483648
 ; CHECK-NEXT:  .LBB10_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
@@ -1019,14 +1019,14 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB10_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
-; CHECK-NEXT:    add.w r1, r12, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB10_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r1], #4
-; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    csel r0, r0, r2, gt
+; CHECK-NEXT:    ldr r1, [r2], #4
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    csel r0, r0, r1, gt
 ; CHECK-NEXT:    le lr, .LBB10_8
 ; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -1103,9 +1103,9 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i8 q0, #0xff
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB11_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -1117,9 +1117,9 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vminv.u32 r2, q0
 ; CHECK-NEXT:    beq .LBB11_9
 ; CHECK-NEXT:  .LBB11_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB11_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -1201,10 +1201,10 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    subs r0, r3, #4
-; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    mov.w r0, #-1
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #2
 ; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:  .LBB12_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
@@ -1215,14 +1215,14 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB12_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
-; CHECK-NEXT:    add.w r1, r12, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB12_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r1], #4
-; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    csel r0, r0, r2, hi
+; CHECK-NEXT:    ldr r1, [r2], #4
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    csel r0, r0, r1, hi
 ; CHECK-NEXT:    le lr, .LBB12_8
 ; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -1299,9 +1299,9 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
+; CHECK-NEXT:    add.w r2, r2, r12, lsr #2
+; CHECK-NEXT:    dls lr, r2
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB13_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
@@ -1313,9 +1313,9 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmaxv.u32 r2, q0
 ; CHECK-NEXT:    beq .LBB13_9
 ; CHECK-NEXT:  .LBB13_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
+; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB13_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
@@ -1397,10 +1397,10 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    subs r0, r3, #4
-; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #2
 ; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:  .LBB14_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
@@ -1411,14 +1411,14 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB14_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r3
-; CHECK-NEXT:    add.w r1, r12, r3, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB14_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r1], #4
-; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    csel r0, r0, r2, hi
+; CHECK-NEXT:    ldr r1, [r2], #4
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    csel r0, r0, r1, hi
 ; CHECK-NEXT:    le lr, .LBB14_8
 ; CHECK-NEXT:  .LBB14_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -1495,9 +1495,9 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r12, r2, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB15_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
@@ -1511,9 +1511,9 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    beq .LBB15_9
 ; CHECK-NEXT:  .LBB15_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r2
+; CHECK-NEXT:    subs r1, r1, r2
 ; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB15_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldmia r0!, {s2}
@@ -1600,9 +1600,9 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r12, r2, #4
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB16_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
@@ -1616,9 +1616,9 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    beq .LBB16_9
 ; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1
-; CHECK-NEXT:    sub.w lr, r1, r2
+; CHECK-NEXT:    subs r1, r1, r2
 ; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:  .LBB16_8: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldmia r0!, {s2}
@@ -1690,8 +1690,8 @@ define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB17_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.32 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB17_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -1795,8 +1795,8 @@ define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB19_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.16 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB19_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
@@ -1903,8 +1903,8 @@ define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB21_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.8 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB21_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
@@ -2011,8 +2011,8 @@ define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB23_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.16 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB23_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
@@ -2116,8 +2116,8 @@ define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB25_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.8 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB25_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
@@ -2224,8 +2224,8 @@ define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB27_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.8 lr, r1
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB27_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
@@ -2329,9 +2329,9 @@ define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cbz r1, .LBB29_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB29_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -2383,8 +2383,8 @@ define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture r
 ; CHECK-NEXT:    cbz r2, .LBB30_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:  .LBB30_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -2442,8 +2442,8 @@ define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture r
 ; CHECK-NEXT:    cbz r2, .LBB31_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:  .LBB31_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index c04243ee57545..567f90b7683a5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -20,32 +20,27 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
 ; CHECK-NEXT:    bhi .LBB0_9
 ; CHECK-NEXT:  @ %bb.3: @ %vector.ph
 ; CHECK-NEXT:    bic r4, r2, #7
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    sub.w r12, r4, #8
-; CHECK-NEXT:    and r7, r2, #7
-; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    sub.w r3, r4, #8
 ; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    add.w r5, r5, r3, lsr #3
 ; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
+; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    and r5, r2, #7
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    vmul.f16 q2, q0, q0
 ; CHECK-NEXT:    vfma.f16 q2, q1, q1
 ; CHECK-NEXT:    vstrb.8 q2, [r1], #16
-; CHECK-NEXT:    bne .LBB0_4
-; CHECK-NEXT:    b .LBB0_5
-; CHECK-NEXT:  .LBB0_5: @ %middle.block
+; CHECK-NEXT:    le lr, .LBB0_4
+; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r4, r2
-; CHECK-NEXT:    mov lr, r7
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r7, pc}
 ; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r5
 ; CHECK-NEXT:  .LBB0_7: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r3]
@@ -61,7 +56,7 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
 ; CHECK-NEXT:  .LBB0_9:
 ; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    b .LBB0_6
 entry:
   %cmp.not11 = icmp eq i32 %numSamples, 0
@@ -156,32 +151,27 @@ define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* n
 ; CHECK-NEXT:    bhi .LBB1_9
 ; CHECK-NEXT:  @ %bb.3: @ %vector.ph
 ; CHECK-NEXT:    bic r4, r2, #3
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    sub.w r12, r4, #4
-; CHECK-NEXT:    and r7, r2, #3
-; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    movs r5, #1
+; CHECK-NEXT:    subs r3, r4, #4
 ; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    add.w r5, r5, r3, lsr #2
 ; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
+; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    and r5, r2, #3
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    vmul.f32 q2, q0, q0
 ; CHECK-NEXT:    vfma.f32 q2, q1, q1
 ; CHECK-NEXT:    vstrb.8 q2, [r1], #16
-; CHECK-NEXT:    bne .LBB1_4
-; CHECK-NEXT:    b .LBB1_5
-; CHECK-NEXT:  .LBB1_5: @ %middle.block
+; CHECK-NEXT:    le lr, .LBB1_4
+; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r4, r2
-; CHECK-NEXT:    mov lr, r7
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r7, pc}
 ; CHECK-NEXT:  .LBB1_6: @ %while.body.preheader26
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dls lr, r5
 ; CHECK-NEXT:  .LBB1_7: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr s0, [r3]
@@ -197,7 +187,7 @@ define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* n
 ; CHECK-NEXT:  .LBB1_9:
 ; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    b .LBB1_6
 entry:
   %cmp.not11 = icmp eq i32 %numSamples, 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 27d5db41c1511..fcf372b60380d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -20,8 +20,8 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n
 ; CHECK-NEXT:    and.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q5, [r0, #32]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index 995ac7d88fde6..f7e42bd2dad3f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -186,8 +186,8 @@ define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    mov.w lr, #64
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    movs r3, #64
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
@@ -231,8 +231,8 @@ define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonl
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    mov.w lr, #128
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    movs r3, #128
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
@@ -276,8 +276,8 @@ define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonl
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    mov.w lr, #256
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    mov.w r3, #256
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16

diff  --git a/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll
index 72b5bb0c7be26..4c455322bc0ce 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll
@@ -4,8 +4,8 @@
 ; DISABLED-NOT: dls lr,
 
 ; CHECK-LABEL: test_target_specific:
-; CHECK:        mov.w lr, #50
-; CHECK:        dls lr, lr
+; CHECK:        movs r2, #50
+; CHECK:        dls lr, r2
 ; CHECK-NOT:    mov lr,
 ; CHECK:      [[LOOP_HEADER:\.LBB[0-9_]+]]:
 ; CHECK:        le lr, [[LOOP_HEADER]]
@@ -31,8 +31,8 @@ exit:
 }
 
 ; CHECK-LABEL: test_fabs:
-; CHECK:        mov.w lr, #100
-; CHECK:        dls lr, lr
+; CHECK:        movs r1, #100
+; CHECK:        dls lr, r1
 ; CHECK-NOT:    mov lr,
 ; CHECK:      [[LOOP_HEADER:\.LBB[0-9_]+]]:
 ; CHECK-NOT:    bl

diff  --git a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll
index 1cbe414166517..80daff02c5665 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll
@@ -8,7 +8,7 @@
 ; DISABLED-NOT: call i32 @llvm.loop.decrement
 
 ; CHECK-LABEL: skip_call
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
 ; CHECK-NOT: call i32 @llvm.loop.decrement
 
 define i32 @skip_call(i32 %n) {
@@ -37,8 +37,8 @@ while.end:
 }
 
 ; CHECK-LABEL: test_target_specific
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
-; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 50)
+; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK: br i1 [[CMP]], label %loop, label %exit
@@ -62,10 +62,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_fabs_f16
-; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
-; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
+; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
 define void @test_fabs_f16(half* %a, half* %b) {
 entry:
   br label %loop
@@ -84,10 +84,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_fabs
-; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
-; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
+; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
 
 define float @test_fabs(float* %a) {
 entry:
@@ -107,11 +107,11 @@ exit:
 }
 
 ; CHECK-LABEL: test_fabs_64
-; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
-; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-FP64:       void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT:   call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:    call i32 @llvm.start.loop.iterations
+; CHECK-FP-NOT:     call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-FP64:       call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call i32 @llvm.start.loop.iterations.i32(i32 100)
 define void @test_fabs_64(double* %a, double* %b) {
 entry:
   br label %loop
@@ -130,9 +130,9 @@ exit:
 }
 
 ; CHECK-LABEL: test_fabs_vec
-; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
-; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
@@ -154,7 +154,7 @@ exit:
 }
 
 ; CHECK-LABEL: test_log
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
 ; CHECK-NOT: llvm.loop.decrement
 define float @test_log(float* %a) {
 entry:
@@ -174,11 +174,11 @@ exit:
 }
 
 ; CHECK-LABEL: test_sqrt_16
-; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
-; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-FP64:     call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
+; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-FP64:     call i32 @llvm.start.loop.iterations.i32(i32 100)
 define void @test_sqrt_16(half* %a, half* %b) {
 entry:
   br label %loop
@@ -196,11 +196,11 @@ exit:
   ret void
 }
 ; CHECK-LABEL: test_sqrt
-; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
-; CHECK-FP: call void @llvm.set.loop.iterations
-; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-FP: call i32 @llvm.start.loop.iterations
+; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
@@ -222,11 +222,11 @@ exit:
 }
 
 ; CHECK-LABEL: test_sqrt_64
-; CHECK-MAIN-NOT:   call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:    call void @llvm.set.loop.iterations
-; CHECK-FP-NOT:     call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP-NOT:  call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-FP64:       call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT:   call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:    call i32 @llvm.start.loop.iterations
+; CHECK-FP-NOT:     call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP-NOT:  call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-FP64:       call i32 @llvm.start.loop.iterations.i32(i32 100)
 define void @test_sqrt_64(double* %a, double* %b) {
 entry:
   br label %loop
@@ -245,10 +245,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_sqrt_vec
-; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations 
-; CHECK-MVE-NOT:  call void @llvm.set.loop.iterations
-; CHECK-FP:       call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVEFP:    call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVE-NOT:  call i32 @llvm.start.loop.iterations
+; CHECK-FP:       call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVEFP:    call i32 @llvm.start.loop.iterations.i32(i32 100)
 define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
 entry:
   br label %loop
@@ -267,7 +267,7 @@ exit:
 }
 
 ; CHECK-LABEL: test_overflow
-; CHECK: call void @llvm.set.loop.iterations
+; CHECK: call i32 @llvm.start.loop.iterations
 define i32 @test_overflow(i32* %a, i32* %b) {
 entry:
   br label %loop
@@ -289,7 +289,7 @@ exit:
 
 ; TODO: We should be able to generate a qadd/sub
 ; CHECK-LABEL: test_sat
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
+; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 100)
 define i32 @test_sat(i32* %a, i32* %b) {
 entry:
   br label %loop
@@ -309,10 +309,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_masked_i32
-; CHECK-NOT: call void @llvm.set.loop.iterations
-; CHECK-MVEFP: call void @llvm.set.loop.iterations
-; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
+; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
@@ -336,10 +336,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_masked_f32
-; CHECK-NOT: call void @llvm.set.loop.iterations
-; CHECK-MVEFP: call void @llvm.set.loop.iterations
-; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
+; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
@@ -363,10 +363,10 @@ exit:
 }
 
 ; CHECK-LABEL: test_gather_scatter
-; CHECK-NOT: call void @llvm.set.loop.iterations
-; CHECK-MVEFP: call void @llvm.set.loop.iterations
-; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
-; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
+; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
+; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
+; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit

diff  --git a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
index c06e5fe4416ab..6c80accf377f5 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -2,17 +2,17 @@
 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+soft-float -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
 
 ; CHECK-LABEL: test_fptosi
-; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
 
 ; CHECK: entry:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
 
 ; CHECK: while.body.lr.ph:
-; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-FP-NEXT: br label %while.body
 
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
 ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
@@ -59,15 +59,15 @@ cleanup:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
 ; CHECK-FP: while.body.lr.ph:
-; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-FP-NEXT: br label %while.body
 
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
 ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
 
-; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
 
 define void @test_fptoui(i32 %n, i32** %g, double** %d) {
 entry:
@@ -111,10 +111,10 @@ cleanup:
 ; CHECK:   [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK:   [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
 ; CHECK: while.body.lr.ph:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK:   [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-NEXT: br label %while.body
 
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
@@ -156,17 +156,17 @@ cleanup:
 }
 
 ; CHECK-LABEL: fp_add
-; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
 ; CHECK: entry:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
 ; CHECK: while.body.lr.ph:
-; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK: br label %while.body
 
 ; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement
 
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
+; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
 ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit

diff  --git a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
index 2ec2ce7c9f9f6..96d1729b1a884 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll
@@ -8,10 +8,10 @@
 @g = common local_unnamed_addr global i32* null, align 4
 
 ; CHECK-LABEL: do_copy
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
+; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 %n)
 ; CHECK: br label %while.body
 
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end
@@ -99,10 +99,10 @@ while.end:
 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
 
 ; CHECK: while.body.lr.ph:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK:   [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK:   br label %while.body
 ; CHECK: while.body:
-; CHECK:   [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK:   [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK:   [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK:   [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK:   br i1 [[CMP]], label %while.body, label %while.end.loopexit
@@ -152,10 +152,10 @@ while.end:
 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
 
 ; CHECK: while.body.lr.ph:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK: br label %while.body
 
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit

diff  --git a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll
index 543fdf7b288f2..06eed80510f7c 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll
@@ -51,13 +51,13 @@ do.end:
 }
 
 ; CHECK-LABEL: nested
-; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-NOT: call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
 
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK: br label %while.body3.us
 
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
@@ -103,19 +103,19 @@ while.end7:
 }
 
 ; CHECK-LABEL: pre_existing
-; CHECK: llvm.set.loop.iterations
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK: llvm.start.loop.iterations
+; CHECK-NOT: llvm.start.loop.iterations
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 ; CHECK-NOT: call i32 @llvm.loop.decrement.reg
 define i32 @pre_existing(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 %n)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %n)
   br label %while.body
 
 while.body:                                       ; preds = %while.body, %entry
   %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ]
   %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ]
-  %0 = phi i32 [ %n, %entry ], [ %2, %while.body ]
+  %0 = phi i32 [ %start, %entry ], [ %2, %while.body ]
   %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
   %1 = load i32, i32* %q.addr.05, align 4
   %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
@@ -158,9 +158,9 @@ while.end:                                        ; preds = %while.body
 }
 
 ; CHECK-LABEL: pre_existing_inner
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.start.loop.iterations
 ; CHECK: while.cond1.preheader.us:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 ; CHECK: br i1
 ; CHECK-NOT: call i32 @llvm.loop.decrement
@@ -172,12 +172,12 @@ entry:
 while.cond1.preheader.us:
   %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
   %mul.us = mul i32 %i.021.us, %N
-  call void @llvm.set.loop.iterations.i32(i32 %N)
+  %start = call i32 @llvm.start.loop.iterations.i32(i32 %N)
   br label %while.body3.us
 
 while.body3.us:
   %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
-  %0 = phi i32 [ %N, %while.cond1.preheader.us ], [ %1, %while.body3.us ]
+  %0 = phi i32 [ %start, %while.cond1.preheader.us ], [ %1, %while.body3.us ]
   %add.us = add i32 %j.019.us, %mul.us
   %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
   store i32 %add.us, i32* %arrayidx.us, align 4
@@ -196,7 +196,7 @@ while.end7:
 }
 
 ; CHECK-LABEL: not_rotated
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
 ; CHECK-NOT: call i32 @llvm.loop.decrement.i32
 define void @not_rotated(i32, i16* nocapture, i16 signext) {
   br label %4
@@ -233,7 +233,7 @@ define void @not_rotated(i32, i16* nocapture, i16 signext) {
 }
 
 ; CHECK-LABEL: multi_latch
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
 ; CHECK-NOT: call i32 @llvm.loop.decrement
 define void @multi_latch(i32* %a, i32* %b, i32 %N) {
 entry:
@@ -322,7 +322,7 @@ for.inc:                                          ; preds = %sw.bb, %sw.bb1, %fo
 }
 
 ; CHECK-LABEL: unroll_inc_int
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32(
 
 ; TODO: We should be able to support the unrolled loop body.
@@ -404,7 +404,7 @@ for.body:
 }
 
 ; CHECK-LABEL: unroll_dec_int
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32(
 
 ; TODO: An unnecessary register is being held to hold COUNT, lr should just
@@ -420,7 +420,7 @@ for.body:
 ; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]:
 ; CHECK-UNROLL:         le lr, [[PROLOGUE]]
 ; CHECK-UNROLL-NEXT: [[PROLOGUE_EXIT:.LBB[0-9_]+]]:
-; CHECK-UNROLL:         dls lr, lr
+; CHECK-UNROLL:         dls lr, r5
 ; CHECK-UNROLL:      [[BODY:.LBB[0-9_]+]]:
 ; CHECK-UNROLL:         le lr, [[BODY]]
 ; CHECK-UNROLL-NOT:     b
@@ -447,7 +447,7 @@ for.body:
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-declare void @llvm.set.loop.iterations.i32(i32) #0
+declare i32 @llvm.start.loop.iterations.i32(i32) #0
 declare i1 @llvm.test.set.loop.iterations.i32(i32) #0
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #0
 

diff  --git a/llvm/test/Transforms/HardwareLoops/loop-guards.ll b/llvm/test/Transforms/HardwareLoops/loop-guards.ll
index 652a5d872c0d0..83fbc5a00b16b 100644
--- a/llvm/test/Transforms/HardwareLoops/loop-guards.ll
+++ b/llvm/test/Transforms/HardwareLoops/loop-guards.ll
@@ -11,7 +11,8 @@
 ; CHECK:   [[COUNT:%[^ ]+]] = add i32 [[MAX]], -1
 ; CHECK:   br i1 %t1, label %do.body.preheader
 ; CHECK: do.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK:   br label %do.body
 define void @test1(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
@@ -36,6 +37,7 @@ if.end:                                           ; preds = %do.body, %entry
 ; CHECK-LABEL: test2
 ; CHECK-NOT: call i1 @llvm.test.set.loop.iterations
 ; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call i32 @llvm.start.loop.iterations
 define void @test2(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
   br i1 %t1, label %do.body, label %if.end
@@ -62,7 +64,8 @@ if.end:                                           ; preds = %do.body, %entry
 ; CHECK:   [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 1
 ; CHECK:   br i1 %brmerge.demorgan, label %do.body.preheader
 ; CHECK: do.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK:   br label %do.body
 define void @test3(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
@@ -88,7 +91,7 @@ if.end:                                           ; preds = %do.body, %entry
 ; CHECK-LABEL: test4
 ; CHECK: entry:
 ; CHECK-LATCH:  br i1 %brmerge.demorgan, label %while.cond
-; CHECK-LATCH-NOT: call void @llvm{{.*}}loop.iterations 
+; CHECK-LATCH-NOT: @llvm{{.*}}loop.iterations 
 ; CHECK-EXIT:   br i1 %brmerge.demorgan, label %while.cond.preheader
 ; CHECK-EXIT: while.cond.preheader:
 ; CHECK-EXIT:   [[COUNT:%[^ ]+]] = add i32 %N, 1
@@ -122,7 +125,8 @@ if.end:                                           ; preds = %while.cond, %entry
 ; CHECK: entry:
 ; CHECK:   br i1 %or.cond, label %while.body.preheader
 ; CHECK: while.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK:   br label %while.body
 define void @test5(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
@@ -221,7 +225,8 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: while.preheader:
 ; CHECK:   br i1 %brmerge.demorgan, label %while.body.preheader
 ; CHECK: while.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK:   br label %while.body
 define void @test8(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
@@ -252,7 +257,8 @@ if.end:                                           ; preds = %while.body, %while.
 ; CHECK: entry:
 ; CHECK:   br i1 %brmerge.demorgan, label %do.body.preheader
 ; CHECK: do.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32 %N)
 ; CHECK:   br label %do.body
 define void @test9(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:
@@ -280,7 +286,8 @@ if.end:                                           ; preds = %do.body, %entry
 ; CHECK: entry:
 ; CHECK:   br i1 %cmp.1, label %do.body.preheader
 ; CHECK: do.body.preheader:
-; CHECK:   call void @llvm.set.loop.iterations.i32(i32
+; CHECK-EXIT:   call void @llvm.set.loop.iterations.i32(i32
+; CHECK-LATCH:   call i32 @llvm.start.loop.iterations.i32(i32
 ; CHECK:   br label %do.body
 define void @test10(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
 entry:

diff  --git a/llvm/test/Transforms/HardwareLoops/scalar-while.ll b/llvm/test/Transforms/HardwareLoops/scalar-while.ll
index acb9efd3b72b3..fb614ea275a0d 100644
--- a/llvm/test/Transforms/HardwareLoops/scalar-while.ll
+++ b/llvm/test/Transforms/HardwareLoops/scalar-while.ll
@@ -30,17 +30,17 @@ define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHI-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK-PHI:       while.body.preheader:
 ; CHECK-PHI-NEXT:    [[TMP0:%.*]] = sub i32 [[N]], [[I]]
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
-; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -86,17 +86,17 @@ define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHIGUARD-NEXT:    [[TMP0:%.*]] = sub i32 [[N]], [[I]]
 ; CHECK-PHIGUARD-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK-PHIGUARD:       while.body.preheader:
-; CHECK-PHIGUARD-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
 ; CHECK-PHIGUARD-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHIGUARD:       while.body:
 ; CHECK-PHIGUARD-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHIGUARD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHIGUARD-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHIGUARD-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
-; CHECK-PHIGUARD-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
-; CHECK-PHIGUARD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-PHIGUARD-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHIGUARD-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHIGUARD-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHIGUARD:       while.end:
 ; CHECK-PHIGUARD-NEXT:    ret void
 ;
@@ -160,17 +160,17 @@ define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHI-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK-PHI:       while.body.preheader:
 ; CHECK-PHI-NEXT:    [[TMP0:%.*]] = sub i32 [[I]], [[N]]
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
-; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -216,17 +216,17 @@ define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHIGUARD-NEXT:    [[TMP0:%.*]] = sub i32 [[I]], [[N]]
 ; CHECK-PHIGUARD-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK-PHIGUARD:       while.body.preheader:
-; CHECK-PHIGUARD-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
 ; CHECK-PHIGUARD-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHIGUARD:       while.body:
 ; CHECK-PHIGUARD-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHIGUARD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHIGUARD-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHIGUARD-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
-; CHECK-PHIGUARD-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
-; CHECK-PHIGUARD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-PHIGUARD-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHIGUARD-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHIGUARD-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHIGUARD:       while.end:
 ; CHECK-PHIGUARD-NEXT:    ret void
 ;
@@ -292,17 +292,17 @@ define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHI:       while.body.preheader:
 ; CHECK-PHI-NEXT:    [[TMP0:%.*]] = add i32 [[I]], 1
 ; CHECK-PHI-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
-; CHECK-PHI-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -351,17 +351,17 @@ define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
 ; CHECK-PHIGUARD:       while.body.preheader:
 ; CHECK-PHIGUARD-NEXT:    [[TMP0:%.*]] = add i32 [[I]], 1
 ; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
-; CHECK-PHIGUARD-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-PHIGUARD-NEXT:    [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]])
 ; CHECK-PHIGUARD-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHIGUARD:       while.body:
 ; CHECK-PHIGUARD-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHIGUARD-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHIGUARD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHIGUARD-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHIGUARD-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
-; CHECK-PHIGUARD-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
-; CHECK-PHIGUARD-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
-; CHECK-PHIGUARD-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD-NEXT:    [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-PHIGUARD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-PHIGUARD-NEXT:    br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHIGUARD:       while.end:
 ; CHECK-PHIGUARD-NEXT:    ret void
 ;
@@ -424,17 +424,17 @@ define void @while_ne(i32 %N, i32* nocapture %A) {
 ; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
 ; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
 ; CHECK-PHI:       while.body.preheader:
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
-; CHECK-PHI-NEXT:    [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -548,17 +548,17 @@ define void @while_eq(i32 %N, i32* nocapture %A) {
 ; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK-PHI:       while.body.preheader:
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
-; CHECK-PHI-NEXT:    [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -676,17 +676,17 @@ define void @while_preheader_eq(i32 %N, i32* nocapture %A) {
 ; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK-PHI:       while.body.preheader:
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK-PHI:       while.body:
 ; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PHI-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
 ; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
 ; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
-; CHECK-PHI-NEXT:    [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
 ; CHECK-PHI:       while.end:
 ; CHECK-PHI-NEXT:    ret void
 ;
@@ -820,18 +820,18 @@ define void @nested(i32* nocapture %A, i32 %N) {
 ; CHECK-PHI:       while.cond1.preheader.us:
 ; CHECK-PHI-NEXT:    [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-PHI-NEXT:    [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
-; CHECK-PHI-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
 ; CHECK-PHI-NEXT:    br label [[WHILE_BODY3_US:%.*]]
 ; CHECK-PHI:       while.body3.us:
 ; CHECK-PHI-NEXT:    [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
-; CHECK-PHI-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ]
 ; CHECK-PHI-NEXT:    [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
 ; CHECK-PHI-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
 ; CHECK-PHI-NEXT:    store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
 ; CHECK-PHI-NEXT:    [[INC_US]] = add nuw i32 [[J_019_US]], 1
-; CHECK-PHI-NEXT:    [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-PHI-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-PHI-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
 ; CHECK-PHI:       while.cond1.while.end_crit_edge.us:
 ; CHECK-PHI-NEXT:    [[INC6_US]] = add nuw i32 [[I_021_US]], 1
 ; CHECK-PHI-NEXT:    [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
@@ -897,18 +897,18 @@ define void @nested(i32* nocapture %A, i32 %N) {
 ; CHECK-PHIGUARD:       while.cond1.preheader.us:
 ; CHECK-PHIGUARD-NEXT:    [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-PHIGUARD-NEXT:    [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
-; CHECK-PHIGUARD-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHIGUARD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
 ; CHECK-PHIGUARD-NEXT:    br label [[WHILE_BODY3_US:%.*]]
 ; CHECK-PHIGUARD:       while.body3.us:
 ; CHECK-PHIGUARD-NEXT:    [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
-; CHECK-PHIGUARD-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHIGUARD-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ]
 ; CHECK-PHIGUARD-NEXT:    [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
 ; CHECK-PHIGUARD-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
 ; CHECK-PHIGUARD-NEXT:    store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
 ; CHECK-PHIGUARD-NEXT:    [[INC_US]] = add nuw i32 [[J_019_US]], 1
-; CHECK-PHIGUARD-NEXT:    [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
-; CHECK-PHIGUARD-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-PHIGUARD-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-PHIGUARD-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
 ; CHECK-PHIGUARD:       while.cond1.while.end_crit_edge.us:
 ; CHECK-PHIGUARD-NEXT:    [[INC6_US]] = add nuw i32 [[I_021_US]], 1
 ; CHECK-PHIGUARD-NEXT:    [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
