[llvm] 1ddfd1c - [CodeGen][ShrinkWrap] Split restore point

via llvm-commits llvm-commits at lists.llvm.org
Mon May 8 00:52:38 PDT 2023


Author: sgokhale
Date: 2023-05-08T13:21:07+05:30
New Revision: 1ddfd1c8186735c62b642df05c505dc4907ffac4

URL: https://github.com/llvm/llvm-project/commit/1ddfd1c8186735c62b642df05c505dc4907ffac4
DIFF: https://github.com/llvm/llvm-project/commit/1ddfd1c8186735c62b642df05c505dc4907ffac4.diff

LOG: [CodeGen][ShrinkWrap] Split restore point

Attempt to reland D42600.

When the restore point has both "dirty" predecessors (blocks reachable from a
use or def of a callee-saved register or frame index) and "clean" predecessors,
split it so that only the dirty paths go through the restore point. This lets
the save point shrink further, keeping the prologue and epilogue off paths that
never touch CSRs or the stack frame. The split can be disabled with the new
-enable-shrink-wrap-region-split option (on by default).

Differential Revision: https://reviews.llvm.org/D42600
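
For context, a hypothetical C++ source (not part of the commit) whose control
flow matches the shrink_test1 case in the new MIR test illustrates the shape of
function this change targets: the early-exit path is "clean" (no call, no
CSR/FI use), while the other paths are "dirty" because the calls clobber LR.
Splitting the shared return block gives the dirty paths their own restore
block, so the prologue/epilogue no longer sit on the clean path.

    extern int fun();

    void shrink_test1(int a) {
      if (a <= 0)
        return;        // clean path: returns without touching CSRs or the stack
      if (fun() == 0)  // calls make these blocks dirty (LR must be saved)
        fun();
    }                  // previously all paths rejoined in a single return block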

Added: 
    llvm/test/CodeGen/AArch64/shrinkwrap-split-restore-point.mir

Modified: 
    llvm/lib/CodeGen/ShrinkWrap.cpp
    llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
    llvm/test/CodeGen/AArch64/dont-shrink-wrap-stack-mayloadorstore.mir
    llvm/test/CodeGen/AArch64/ragreedy-csr.ll
    llvm/test/CodeGen/AArch64/taildup-cfi.ll
    llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
    llvm/test/CodeGen/ARM/code-placement.ll
    llvm/test/CodeGen/ARM/mbp.ll
    llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
    llvm/test/CodeGen/PowerPC/common-chain-aix32.ll
    llvm/test/CodeGen/PowerPC/common-chain.ll
    llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
    llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
    llvm/test/CodeGen/PowerPC/shrink-wrap.ll
    llvm/test/CodeGen/PowerPC/shrink-wrap.mir
    llvm/test/CodeGen/RISCV/aext-to-sext.ll
    llvm/test/CodeGen/RISCV/fli-licm.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
    llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
    llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
    llvm/test/CodeGen/X86/fold-call-3.ll
    llvm/test/CodeGen/X86/negative-stride-fptosi-user.ll
    llvm/test/CodeGen/X86/pr44412.ll
    llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
    llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
    llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index b219b83bbc2fe..90cba95b7f199 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -98,6 +98,9 @@ STATISTIC(NumCandidatesDropped,
 static cl::opt<cl::boolOrDefault>
 EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden,
                     cl::desc("enable the shrink-wrapping pass"));
+static cl::opt<bool> EnablePostShrinkWrapOpt(
+    "enable-shrink-wrap-region-split", cl::init(true), cl::Hidden,
+    cl::desc("enable splitting of the restore block if possible"));
 
 namespace {
 
@@ -185,6 +188,30 @@ class ShrinkWrap : public MachineFunctionPass {
   /// this call.
   void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS);
 
+  // Try to find a safe point based on dominance and block frequency without
+  // any change in the IR.
+  bool performShrinkWrapping(MachineFunction &MF, RegScavenger *RS);
+
+  /// This function tries to split the restore point if doing so can shrink the
+  /// save point further. \return True if the restore point is split.
+  bool postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
+                          RegScavenger *RS);
+
+  /// This function analyzes if the restore point can be split to create a new
+  /// restore point. This function collects
+  /// 1. Any preds of the current restore that are reachable from blocks with
+  /// a use or def of CSRs/FI
+  /// - indicated by DirtyPreds
+  /// 2. Any preds of the current restore that are not DirtyPreds - indicated
+  /// by CleanPreds
+  /// Both sets should be non-empty to consider splitting the restore point.
+  bool checkIfRestoreSplittable(
+      const MachineBasicBlock *CurRestore,
+      const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+      SmallVectorImpl<MachineBasicBlock *> &DirtyPreds,
+      SmallVectorImpl<MachineBasicBlock *> &CleanPreds,
+      const TargetInstrInfo *TII, RegScavenger *RS);
+
   /// Initialize the pass for \p MF.
   void init(MachineFunction &MF) {
     RCI.runOnMachineFunction(MF);
@@ -338,18 +365,311 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
 /// Helper function to find the immediate (post) dominator.
 template <typename ListOfBBs, typename DominanceAnalysis>
 static MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs,
-                                   DominanceAnalysis &Dom) {
+                                   DominanceAnalysis &Dom, bool Strict = true) {
   MachineBasicBlock *IDom = &Block;
   for (MachineBasicBlock *BB : BBs) {
     IDom = Dom.findNearestCommonDominator(IDom, BB);
     if (!IDom)
       break;
   }
-  if (IDom == &Block)
+  if (Strict && IDom == &Block)
     return nullptr;
   return IDom;
 }
 
+static bool isAnalyzableBB(const TargetInstrInfo &TII,
+                           MachineBasicBlock &Entry) {
+  // Check if the block is analyzable.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  return !TII.analyzeBranch(Entry, TBB, FBB, Cond);
+}
+
+/// Determines if any predecessor of MBB is on a path from a block that has a
+/// use or def of CSRs/FI to MBB.
+/// ReachableByDirty: All blocks reachable from a block that has a use or def
+/// of CSRs/FI.
+static bool
+hasDirtyPred(const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+             const MachineBasicBlock &MBB) {
+  for (const MachineBasicBlock *PredBB : MBB.predecessors())
+    if (ReachableByDirty.count(PredBB))
+      return true;
+  return false;
+}
+
+/// Derives the list of all the basic blocks reachable from MBB.
+static void markAllReachable(DenseSet<const MachineBasicBlock *> &Visited,
+                             const MachineBasicBlock &MBB) {
+  SmallVector<MachineBasicBlock *, 4> Worklist(MBB.succ_begin(),
+                                               MBB.succ_end());
+  Visited.insert(&MBB);
+  while (!Worklist.empty()) {
+    MachineBasicBlock *SuccMBB = Worklist.pop_back_val();
+    if (!Visited.insert(SuccMBB).second)
+      continue;
+    Worklist.append(SuccMBB->succ_begin(), SuccMBB->succ_end());
+  }
+}
+
+/// Collect blocks reachable by use or def of CSRs/FI.
+static void collectBlocksReachableByDirty(
+    const DenseSet<const MachineBasicBlock *> &DirtyBBs,
+    DenseSet<const MachineBasicBlock *> &ReachableByDirty) {
+  for (const MachineBasicBlock *MBB : DirtyBBs) {
+    if (ReachableByDirty.count(MBB))
+      continue;
+    // Mark all descendants as reachable.
+    markAllReachable(ReachableByDirty, *MBB);
+  }
+}
+
+/// \return true if there is a clean path from SavePoint to the original
+/// Restore.
+static bool
+isSaveReachableThroughClean(const MachineBasicBlock *SavePoint,
+                            ArrayRef<MachineBasicBlock *> CleanPreds) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  SmallVector<MachineBasicBlock *, 4> Worklist(CleanPreds.begin(),
+                                               CleanPreds.end());
+  while (!Worklist.empty()) {
+    MachineBasicBlock *CleanBB = Worklist.pop_back_val();
+    if (CleanBB == SavePoint)
+      return true;
+    if (!Visited.insert(CleanBB).second || !CleanBB->pred_size())
+      continue;
+    Worklist.append(CleanBB->pred_begin(), CleanBB->pred_end());
+  }
+  return false;
+}
+
+/// This function updates the branches post restore point split.
+///
+/// Restore point has been split.
+/// Old restore point: MBB
+/// New restore point: NMBB
+/// Any basic block (say BBToUpdate) that previously had a fallthrough to MBB
+/// should now
+/// 1. Fall through to NMBB iff NMBB is inserted immediately before MBB in the
+/// block layout, OR
+/// 2. Branch unconditionally to NMBB iff NMBB is inserted anywhere else.
+static void updateTerminator(MachineBasicBlock *BBToUpdate,
+                             MachineBasicBlock *NMBB,
+                             const TargetInstrInfo *TII) {
+  DebugLoc DL = BBToUpdate->findBranchDebugLoc();
+  // If NMBB isn't the new layout successor for BBToUpdate, insert an
+  // unconditional branch to it.
+  if (!BBToUpdate->isLayoutSuccessor(NMBB))
+    TII->insertUnconditionalBranch(*BBToUpdate, NMBB, DL);
+}
+
+/// This function splits the restore point and returns new restore point/BB.
+///
+/// DirtyPreds: Predecessors of \p MBB that are ReachableByDirty
+///
+/// Decision has been made to split the restore point.
+/// old restore point: \p MBB
+/// new restore point: \p NMBB
+/// This function makes the necessary block layout changes so that
+/// 1. \p NMBB points to \p MBB unconditionally
+/// 2. All dirtyPreds that previously pointed to \p MBB point to \p NMBB
+static MachineBasicBlock *
+tryToSplitRestore(MachineBasicBlock *MBB,
+                  ArrayRef<MachineBasicBlock *> DirtyPreds,
+                  const TargetInstrInfo *TII) {
+  MachineFunction *MF = MBB->getParent();
+
+  // Get the list of DirtyPreds that have a fallthrough to MBB
+  // before the block layout change. This is just to ensure that if NMBB is
+  // inserted after MBB, then we create an unconditional branch from
+  // DirtyPred/CleanPred to NMBB.
+  SmallPtrSet<MachineBasicBlock *, 8> MBBFallthrough;
+  for (MachineBasicBlock *BB : DirtyPreds)
+    if (BB->getFallThrough(false) == MBB)
+      MBBFallthrough.insert(BB);
+
+  MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+  // Insert this block at the end of the function. Inserting in between may
+  // interfere with control flow optimizer decisions.
+  MF->insert(MF->end(), NMBB);
+
+  for (const MachineBasicBlock::RegisterMaskPair &LI : MBB->liveins())
+    NMBB->addLiveIn(LI.PhysReg);
+
+  TII->insertUnconditionalBranch(*NMBB, MBB, DebugLoc());
+
+  // After splitting, all predecessors of the restore point should be dirty
+  // blocks.
+  for (MachineBasicBlock *SuccBB : DirtyPreds)
+    SuccBB->ReplaceUsesOfBlockWith(MBB, NMBB);
+
+  NMBB->addSuccessor(MBB);
+
+  for (MachineBasicBlock *BBToUpdate : MBBFallthrough)
+    updateTerminator(BBToUpdate, NMBB, TII);
+
+  return NMBB;
+}
+
+/// This function undoes the restore point split done earlier.
+///
+/// DirtyPreds: All predecessors of \p NMBB that are ReachableByDirty.
+///
+/// The restore point was split and the change needs to be rolled back. Make
+/// the necessary changes to reset the restore point from \p NMBB to \p MBB.
+static void rollbackRestoreSplit(MachineFunction &MF, MachineBasicBlock *NMBB,
+                                 MachineBasicBlock *MBB,
+                                 ArrayRef<MachineBasicBlock *> DirtyPreds,
+                                 const TargetInstrInfo *TII) {
+  // For a BB, if NMBB is the fallthrough in the current layout, then in the
+  // new layout a. BB should fall through to MBB, OR b. BB should
+  // unconditionally branch to MBB.
+  SmallPtrSet<MachineBasicBlock *, 8> NMBBFallthrough;
+  for (MachineBasicBlock *BB : DirtyPreds)
+    if (BB->getFallThrough(false) == NMBB)
+      NMBBFallthrough.insert(BB);
+
+  NMBB->removeSuccessor(MBB);
+  for (MachineBasicBlock *SuccBB : DirtyPreds)
+    SuccBB->ReplaceUsesOfBlockWith(NMBB, MBB);
+
+  NMBB->erase(NMBB->begin(), NMBB->end());
+  NMBB->eraseFromParent();
+
+  for (MachineBasicBlock *BBToUpdate : NMBBFallthrough)
+    updateTerminator(BBToUpdate, MBB, TII);
+}
+
+// A block is deemed fit for a restore point split iff there exist
+// 1. DirtyPreds - preds of CurRestore reachable from a use or def of CSR/FI
+// 2. CleanPreds - preds of CurRestore that aren't DirtyPreds
+bool ShrinkWrap::checkIfRestoreSplittable(
+    const MachineBasicBlock *CurRestore,
+    const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+    SmallVectorImpl<MachineBasicBlock *> &DirtyPreds,
+    SmallVectorImpl<MachineBasicBlock *> &CleanPreds,
+    const TargetInstrInfo *TII, RegScavenger *RS) {
+  for (const MachineInstr &MI : *CurRestore)
+    if (useOrDefCSROrFI(MI, RS))
+      return false;
+
+  for (MachineBasicBlock *PredBB : CurRestore->predecessors()) {
+    if (!isAnalyzableBB(*TII, *PredBB))
+      return false;
+
+    if (ReachableByDirty.count(PredBB))
+      DirtyPreds.push_back(PredBB);
+    else
+      CleanPreds.push_back(PredBB);
+  }
+
+  return !(CleanPreds.empty() || DirtyPreds.empty());
+}
+
+bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
+                                    RegScavenger *RS) {
+  if (!EnablePostShrinkWrapOpt)
+    return false;
+
+  MachineBasicBlock *InitSave = nullptr;
+  MachineBasicBlock *InitRestore = nullptr;
+
+  if (HasCandidate) {
+    InitSave = Save;
+    InitRestore = Restore;
+  } else {
+    InitRestore = nullptr;
+    InitSave = &MF.front();
+    for (MachineBasicBlock &MBB : MF) {
+      if (MBB.isEHFuncletEntry())
+        return false;
+      if (MBB.isReturnBlock()) {
+        // Do not support multiple restore points.
+        if (InitRestore)
+          return false;
+        InitRestore = &MBB;
+      }
+    }
+  }
+
+  if (!InitSave || !InitRestore || InitRestore == InitSave ||
+      !MDT->dominates(InitSave, InitRestore) ||
+      !MPDT->dominates(InitRestore, InitSave))
+    return false;
+
+  // Bail out of the optimization if any of the basic blocks is a target of an
+  // INLINEASM_BR instruction.
+  for (MachineBasicBlock &MBB : MF)
+    if (MBB.isInlineAsmBrIndirectTarget())
+      return false;
+
+  DenseSet<const MachineBasicBlock *> DirtyBBs;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.isEHPad()) {
+      DirtyBBs.insert(&MBB);
+      continue;
+    }
+    for (const MachineInstr &MI : MBB)
+      if (useOrDefCSROrFI(MI, RS)) {
+        DirtyBBs.insert(&MBB);
+        break;
+      }
+  }
+
+  // Find blocks reachable from the use or def of CSRs/FI.
+  DenseSet<const MachineBasicBlock *> ReachableByDirty;
+  collectBlocksReachableByDirty(DirtyBBs, ReachableByDirty);
+
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SmallVector<MachineBasicBlock *, 2> DirtyPreds;
+  SmallVector<MachineBasicBlock *, 2> CleanPreds;
+  if (!checkIfRestoreSplittable(InitRestore, ReachableByDirty, DirtyPreds,
+                                CleanPreds, TII, RS))
+    return false;
+
+  // Try to find a new save point that dominates all dirty blocks.
+  MachineBasicBlock *NewSave =
+      FindIDom<>(**DirtyPreds.begin(), DirtyPreds, *MDT, false);
+
+  while (NewSave && (hasDirtyPred(ReachableByDirty, *NewSave) ||
+                     EntryFreq < MBFI->getBlockFreq(NewSave).getFrequency()))
+    NewSave = FindIDom<>(**NewSave->pred_begin(), NewSave->predecessors(), *MDT,
+                         false);
+
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (!NewSave || NewSave == InitSave ||
+      isSaveReachableThroughClean(NewSave, CleanPreds) ||
+      !TFI->canUseAsPrologue(*NewSave))
+    return false;
+
+  // Now we know that splitting the restore point can isolate it from the
+  // clean blocks, and that doing so can shrink the save point.
+  MachineBasicBlock *NewRestore =
+      tryToSplitRestore(InitRestore, DirtyPreds, TII);
+
+  // Make sure the new restore point is valid as an epilogue, as determined by
+  // the target.
+  if (!TFI->canUseAsEpilogue(*NewRestore)) {
+    rollbackRestoreSplit(MF, NewRestore, InitRestore, DirtyPreds, TII);
+    return false;
+  }
+
+  Save = NewSave;
+  Restore = NewRestore;
+
+  MDT->runOnMachineFunction(MF);
+  MPDT->runOnMachineFunction(MF);
+
+  assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) &&
+         "Incorrect save or restore point due to dominance relations");
+  assert((!MLI->getLoopFor(Save) && !MLI->getLoopFor(Restore)) &&
+         "Unexpected save or restore point in a loop");
+  assert((EntryFreq >= MBFI->getBlockFreq(Save).getFrequency() &&
+          EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+         "Incorrect save or restore point based on block frequency");
+  return true;
+}
+
 void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
                                          RegScavenger *RS) {
   // Get rid of the easy cases first.
@@ -481,31 +801,7 @@ static bool giveUpWithRemarks(MachineOptimizationRemarkEmitter *ORE,
   return false;
 }
 
-bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
-
-  init(MF);
-
-  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
-  if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
-    // If MF is irreducible, a block may be in a loop without
-    // MachineLoopInfo reporting it. I.e., we may use the
-    // post-dominance property in loops, which lead to incorrect
-    // results. Moreover, we may miss that the prologue and
-    // epilogue are not in the same loop, leading to unbalanced
-    // construction/deconstruction of the stack frame.
-    return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
-                             "Irreducible CFGs are not supported yet.",
-                             MF.getFunction().getSubprogram(), &MF.front());
-  }
-
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  std::unique_ptr<RegScavenger> RS(
-      TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
-
+bool ShrinkWrap::performShrinkWrapping(MachineFunction &MF, RegScavenger *RS) {
   for (MachineBasicBlock &MBB : MF) {
     LLVM_DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' '
                       << MBB.getName() << '\n');
@@ -521,7 +817,7 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
       // are at least at the boundary of the save and restore points.  The
       // problem is that a basic block can jump out from the middle in these
       // cases, which we do not handle.
-      updateSaveRestorePoints(MBB, RS.get());
+      updateSaveRestorePoints(MBB, RS);
       if (!ArePointsInteresting()) {
         LLVM_DEBUG(dbgs() << "EHPad/inlineasm_br prevents shrink-wrapping\n");
         return false;
@@ -530,11 +826,11 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
     }
 
     for (const MachineInstr &MI : MBB) {
-      if (!useOrDefCSROrFI(MI, RS.get()))
+      if (!useOrDefCSROrFI(MI, RS))
         continue;
       // Save (resp. restore) point must dominate (resp. post dominate)
       // MI. Look for the proper basic block for those.
-      updateSaveRestorePoints(MBB, RS.get());
+      updateSaveRestorePoints(MBB, RS);
       // If we are at a point where we cannot improve the placement of
       // save/restore instructions, just give up.
       if (!ArePointsInteresting()) {
@@ -588,13 +884,49 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
         break;
       NewBB = Restore;
     }
-    updateSaveRestorePoints(*NewBB, RS.get());
+    updateSaveRestorePoints(*NewBB, RS);
   } while (Save && Restore);
 
   if (!ArePointsInteresting()) {
     ++NumCandidatesDropped;
     return false;
   }
+  return true;
+}
+
+bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+  init(MF);
+
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+  if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
+    // If MF is irreducible, a block may be in a loop without
+    // MachineLoopInfo reporting it. I.e., we may use the
+    // post-dominance property in loops, which lead to incorrect
+    // results. Moreover, we may miss that the prologue and
+    // epilogue are not in the same loop, leading to unbalanced
+    // construction/deconstruction of the stack frame.
+    return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
+                             "Irreducible CFGs are not supported yet.",
+                             MF.getFunction().getSubprogram(), &MF.front());
+  }
+
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  std::unique_ptr<RegScavenger> RS(
+      TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
+
+  bool Changed = false;
+
+  bool HasCandidate = performShrinkWrapping(MF, RS.get());
+  Changed = postShrinkWrapping(HasCandidate, MF, RS.get());
+  if (!HasCandidate && !Changed)
+    return false;
+  if (!ArePointsInteresting())
+    return Changed;
 
   LLVM_DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: "
                     << Save->getNumber() << ' ' << Save->getName()
@@ -605,7 +937,7 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
   MFI.setSavePoint(Save);
   MFI.setRestorePoint(Restore);
   ++NumCandidates;
-  return false;
+  return Changed;
 }
 
 bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) {

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 49a15528c041a..8dd4da1ee4401 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -424,8 +424,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
 ; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    b .LBB5_7
 ; CHECK-NEXT:  .LBB5_3:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    b .LBB5_9
 ; CHECK-NEXT:  .LBB5_4: // %vector.ph
 ; CHECK-NEXT:    and x11, x10, #0xfffffff0
 ; CHECK-NEXT:    add x8, x0, #8

diff  --git a/llvm/test/CodeGen/AArch64/dont-shrink-wrap-stack-mayloadorstore.mir b/llvm/test/CodeGen/AArch64/dont-shrink-wrap-stack-mayloadorstore.mir
index bc60b7b571197..34fafb750083c 100644
--- a/llvm/test/CodeGen/AArch64/dont-shrink-wrap-stack-mayloadorstore.mir
+++ b/llvm/test/CodeGen/AArch64/dont-shrink-wrap-stack-mayloadorstore.mir
@@ -6,8 +6,8 @@
  ; RUN: llc -x=mir -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s
  ; CHECK:      name:            compiler_pop_stack
  ; CHECK:      frameInfo:       
- ; CHECK-NOT:  savePoint:
- ; CHECK-NOT:  restorePoint:
+ ; CHECK:      savePoint:       '%bb.1'
+ ; CHECK-NEXT: restorePoint:    '%bb.7'
  ; CHECK:      stack:
  ; CHECK:      name:            f
  ; CHECK:      frameInfo:       

diff  --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
index 98c95c38bbb6b..99f01883dbfb1 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -21,16 +21,16 @@ declare i32 @__maskrune(i32, i64) #7
 define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly %b) #9 {
 ; CHECK-LABEL: prune_match:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x1]
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    b.ne LBB0_47
+; CHECK-NEXT:  ; %bb.1: ; %if.end
 ; CHECK-NEXT:    sub sp, sp, #64
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne LBB0_42
-; CHECK-NEXT:  ; %bb.1: ; %if.end
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x14, __DefaultRuneLocale at GOTPAGE
 ; CHECK-NEXT:    mov x9, xzr
@@ -243,7 +243,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b.eq LBB0_37
 ; CHECK-NEXT:  LBB0_42:
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:  LBB0_43: ; %return
+; CHECK-NEXT:  LBB0_43:
 ; CHECK-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -259,6 +259,12 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:  ; %bb.46: ; %land.lhs.true52
 ; CHECK-NEXT:    cbz w8, LBB0_43
 ; CHECK-NEXT:    b LBB0_12
+; CHECK-NEXT:  LBB0_47:
+; CHECK-NEXT:    .cfi_def_cfa wsp, 0
+; CHECK-NEXT:    .cfi_same_value w30
+; CHECK-NEXT:    .cfi_same_value w29
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh0, Lloh1
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh2, Lloh3
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh4, Lloh5

diff  --git a/llvm/test/CodeGen/AArch64/shrinkwrap-split-restore-point.mir b/llvm/test/CodeGen/AArch64/shrinkwrap-split-restore-point.mir
new file mode 100644
index 0000000000000..5b43dde0ae250
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shrinkwrap-split-restore-point.mir
@@ -0,0 +1,760 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=aarch64 -run-pass=shrink-wrap -o - %s | FileCheck %s
+
+--- |
+  define void @shrink_test1(i32 %a) {
+  entry:
+    %cmp5 = icmp sgt i32 %a, 0
+    br i1 %cmp5, label %BB0, label %exit
+
+  BB0:                                              ; preds = %entry
+    %call = call i32 @fun()
+    %c = icmp eq i32 %call, 0
+    br i1 %c, label %BB1, label %exit
+
+  BB1:                                              ; preds = %BB0
+    %call2 = call i32 @fun()
+    br label %exit
+
+  exit:                                             ; preds = %BB1, %BB0, %entry
+    ret void
+  }
+
+  define void @shrink_test2(i32 %a, ptr %P1, ptr %P2) {
+  BB00:
+    %cmp5 = icmp sgt i32 %a, 0
+    br i1 %cmp5, label %BB01, label %exit
+
+  BB01:                                             ; preds = %BB00
+    store i32 %a, ptr %P1, align 4
+    %c1 = icmp sgt i32 %a, 1
+    br i1 %c1, label %BB02, label %BB03
+
+  BB02:                                             ; preds = %BB01
+    store i32 %a, ptr %P2, align 4
+    br label %BB03
+
+  BB03:                                             ; preds = %BB02, %BB01
+    %call03 = call i32 @fun()
+    %c03 = icmp eq i32 %call03, 0
+    br i1 %c03, label %BB04, label %BB05
+
+  BB04:                                             ; preds = %BB03
+    %call04 = call i32 @fun()
+    br label %BB05
+
+  BB05:                                             ; preds = %BB04, %BB03
+    %call05 = call i32 @fun()
+    %c05 = icmp eq i32 %call05, 0
+    br i1 %c05, label %BB06, label %BB07
+
+  BB06:                                             ; preds = %BB05
+    %call06 = call i32 @fun()
+    br label %exit
+
+  BB07:                                             ; preds = %BB05
+    %call07 = call i32 @fun2()
+    br label %exit
+
+  exit:                                             ; preds = %BB07, %BB06, %BB00
+    ret void
+  }
+
+  define void @noshrink_test1(i32 %a, i32 %v, i32 %v2) {
+  entry:
+    %cmp5 = icmp sgt i32 %a, 0
+    br i1 %cmp5, label %BB0, label %exit
+
+  BB0:                                              ; preds = %entry
+    %c = icmp eq i32 %a, 10
+    %c1 = icmp eq i32 %v, 10
+    %or.cond = select i1 %c, i1 %c1, i1 false
+    br i1 %or.cond, label %BB3, label %BB2
+
+  BB2:                                              ; preds = %BB0
+    %c2 = icmp eq i32 %v2, 10
+    br i1 %c2, label %BB4, label %exit
+
+  BB3:                                              ; preds = %BB0
+    %call3 = call i32 @fun()
+    br label %exit
+
+  BB4:                                              ; preds = %BB2
+    %call4 = call i32 @fun2()
+    br label %exit
+
+  exit:                                             ; preds = %BB4, %BB3, %BB2, %entry
+    ret void
+  }
+
+  define void @noshrink_test2(i32 %a) {
+  BB00:
+    %cmp5 = icmp sgt i32 %a, 0
+    br i1 %cmp5, label %BB01, label %InfLoop.preheader
+
+  InfLoop.preheader:                                ; preds = %BB00
+    br label %InfLoop
+
+  BB01:                                             ; preds = %BB00
+    %call = call i32 @fun()
+    %c = icmp eq i32 %call, 0
+    br i1 %c, label %BB02, label %exit
+
+  BB02:                                             ; preds = %BB01
+    %call2 = call i32 @fun()
+    br label %exit
+
+  InfLoop:                                          ; preds = %InfLoop.preheader, %InfLoop
+    %call3 = call i32 @fun()
+    br label %InfLoop
+
+  exit:                                             ; preds = %BB02, %BB01
+    ret void
+  }
+
+  define void @noshrink_test3(i32 %a) {
+  BB00:
+    %cmp5 = icmp sgt i32 %a, 0
+    %call02 = call i32 @fun()
+    br i1 %cmp5, label %BB02, label %BB01
+
+  BB01:                                             ; preds = %BB00
+    %0 = icmp eq i32 %call02, 0
+    br i1 %0, label %BB01.1, label %exit
+
+  BB01.1:                                           ; preds = %BB01
+    call void @abort() #0
+    unreachable
+
+  BB02:                                             ; preds = %BB00
+    %1 = icmp eq i32 %call02, 0
+    br i1 %1, label %BB03, label %BB04
+
+  BB03:                                             ; preds = %BB02
+    %call03 = call i32 @fun()
+    %c03 = icmp eq i32 %call03, 0
+    br i1 %c03, label %BB04, label %exit
+
+  BB04:                                             ; preds = %BB03, %BB02
+    %call04 = call i32 @fun()
+    br label %exit
+
+  exit:                                             ; preds = %BB04, %BB03, %BB01
+    ret void
+  }
+
+  define void @noshrink_bb_as_inlineasmbr_target(i1 %cond) {
+  entry:
+    br i1 %cond, label %0, label %exit
+
+  0:                                                ; preds = %entry
+    callbr void asm sideeffect "", "!i,~{flags}"()
+      to label %1 [label %exit]
+
+  1:                                                ; preds = %0
+    call void @dosomething()
+    br label %exit
+
+  exit:                                             ; preds = %1, %0, %entry
+    ret void
+  }
+
+  declare i32 @fun()
+  declare i32 @fun2()
+  declare void @abort()
+  declare void @dosomething()
+...
+---
+name:            shrink_test1
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: shrink_test1
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x50000000), %bb.3(0x30000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.3, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.BB0:
+  ; CHECK-NEXT:   successors: %bb.2(0x30000000), %bb.4(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.4
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.BB1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.3
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $w0
+
+    dead $wzr = SUBSWri killed renamable $w0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.3, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.BB0:
+    successors: %bb.2(0x30000000), %bb.3(0x50000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    CBNZW killed renamable $w0, %bb.3
+    B %bb.2
+
+  bb.2.BB1:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.3.exit:
+    RET_ReallyLR
+
+...
+---
+name:            shrink_test2
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$w0' }
+  - { reg: '$x1' }
+  - { reg: '$x2' }
+frameInfo:
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: shrink_test2
+  ; CHECK: bb.0.BB00:
+  ; CHECK-NEXT:   successors: %bb.1(0x50000000), %bb.8(0x30000000)
+  ; CHECK-NEXT:   liveins: $w0, $x1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.8, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.BB01:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0, $x1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri renamable $w0, 2, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   STRWui renamable $w0, killed renamable $x1, 0 :: (store (s32) into %ir.P1)
+  ; CHECK-NEXT:   Bcc 11, %bb.3, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.BB02:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   STRWui killed renamable $w0, killed renamable $x2, 0 :: (store (s32) into %ir.P2)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.BB03:
+  ; CHECK-NEXT:   successors: %bb.4(0x30000000), %bb.5(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.5
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.BB04:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.BB05:
+  ; CHECK-NEXT:   successors: %bb.6(0x30000000), %bb.7(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.7
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.BB06:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.BB07:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun2, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.8
+  bb.0.BB00:
+    successors: %bb.1(0x50000000), %bb.8(0x30000000)
+    liveins: $w0, $x1, $x2
+
+    dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.8, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.BB01:
+    successors: %bb.2, %bb.3
+    liveins: $w0, $x1, $x2
+
+    dead $wzr = SUBSWri renamable $w0, 2, 0, implicit-def $nzcv
+    STRWui renamable $w0, killed renamable $x1, 0 :: (store (s32) into %ir.P1)
+    Bcc 11, %bb.3, implicit killed $nzcv
+    B %bb.2
+
+  bb.2.BB02:
+    liveins: $w0, $x2
+
+    STRWui killed renamable $w0, killed renamable $x2, 0 :: (store (s32) into %ir.P2)
+
+  bb.3.BB03:
+    successors: %bb.4(0x30000000), %bb.5(0x50000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    CBNZW killed renamable $w0, %bb.5
+    B %bb.4
+
+  bb.4.BB04:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.5.BB05:
+    successors: %bb.6(0x30000000), %bb.7(0x50000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    CBNZW killed renamable $w0, %bb.7
+    B %bb.6
+
+  bb.6.BB06:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.8
+
+  bb.7.BB07:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun2, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.8.exit:
+    RET_ReallyLR
+
+...
+---
+name:            noshrink_test1
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$w0' }
+  - { reg: '$w1' }
+  - { reg: '$w2' }
+frameInfo:
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: noshrink_test1
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x50000000), %bb.6(0x30000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.6, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.BB0:
+  ; CHECK-NEXT:   successors: %bb.2(0x60000000), %bb.3(0x20000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w0, 10, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 1, %bb.3, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.BB0:
+  ; CHECK-NEXT:   successors: %bb.4(0x55555555), %bb.3(0x2aaaaaab)
+  ; CHECK-NEXT:   liveins: $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w1, 10, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.BB2:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT:   liveins: $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w2, 10, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.BB3:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.BB4:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun2, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.6(0x30000000)
+    liveins: $w0, $w1, $w2
+
+    dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.6, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.BB0:
+    successors: %bb.2(0x60000000), %bb.3(0x20000000)
+    liveins: $w0, $w1, $w2
+
+    dead $wzr = SUBSWri killed renamable $w0, 10, 0, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit killed $nzcv
+    B %bb.2
+
+  bb.2.BB0:
+    successors: %bb.4(0x55555555), %bb.3(0x2aaaaaab)
+    liveins: $w1, $w2
+
+    dead $wzr = SUBSWri killed renamable $w1, 10, 0, implicit-def $nzcv
+    Bcc 0, %bb.4, implicit killed $nzcv
+    B %bb.3
+
+  bb.3.BB2:
+    liveins: $w2
+
+    dead $wzr = SUBSWri killed renamable $w2, 10, 0, implicit-def $nzcv
+    Bcc 0, %bb.5, implicit killed $nzcv
+    B %bb.6
+
+  bb.4.BB3:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.6
+
+  bb.5.BB4:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun2, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.6.exit:
+    RET_ReallyLR
+
+...
+---
+name:            noshrink_test2
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: noshrink_test2
+  ; CHECK: bb.0.BB00:
+  ; CHECK-NEXT:   successors: %bb.2(0x50000000), %bb.1(0x30000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w0, 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 12, %bb.2, implicit killed $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.BB01:
+  ; CHECK-NEXT:   successors: %bb.3(0x30000000), %bb.5(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.5
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.BB02:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.InfLoop:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.BB00:
+    successors: %bb.2(0x50000000), %bb.1(0x30000000)
+    liveins: $w0
+
+    dead $wzr = SUBSWri killed renamable $w0, 0, 0, implicit-def $nzcv
+    Bcc 12, %bb.2, implicit killed $nzcv
+
+  bb.1:
+    B %bb.4
+
+  bb.2.BB01:
+    successors: %bb.3(0x30000000), %bb.5(0x50000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    CBNZW killed renamable $w0, %bb.5
+    B %bb.3
+
+  bb.3.BB02:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.5
+
+  bb.4.InfLoop:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.4
+
+  bb.5.exit:
+    RET_ReallyLR
+
+...
+---
+name:            noshrink_test3
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: noshrink_test3
+  ; CHECK: bb.0.BB00:
+  ; CHECK-NEXT:   successors: %bb.3(0x50000000), %bb.1(0x30000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w19 = COPY $w0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   dead $wzr = SUBSWri killed renamable $w19, 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 12, %bb.3, implicit killed $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.BB01:
+  ; CHECK-NEXT:   successors: %bb.2(0x00000800), %bb.6(0x7ffff800)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.6
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.BB01.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @abort, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.BB02:
+  ; CHECK-NEXT:   successors: %bb.4(0x30000000), %bb.5(0x50000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.5
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.BB03:
+  ; CHECK-NEXT:   successors: %bb.5(0x30000000), %bb.6(0x50000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   CBNZW killed renamable $w0, %bb.6
+  ; CHECK-NEXT:   B %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.BB04:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.BB00:
+    successors: %bb.3(0x50000000), %bb.1(0x30000000)
+    liveins: $w0
+
+    renamable $w19 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    dead $wzr = SUBSWri killed renamable $w19, 0, 0, implicit-def $nzcv
+    Bcc 12, %bb.3, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.BB01:
+    successors: %bb.2(0x00000800), %bb.6(0x7ffff800)
+    liveins: $w0
+
+    CBNZW killed renamable $w0, %bb.6
+    B %bb.2
+
+  bb.2.BB01.1:
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @abort, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.3.BB02:
+    successors: %bb.4(0x30000000), %bb.5(0x50000000)
+    liveins: $w0
+
+    CBNZW killed renamable $w0, %bb.5
+    B %bb.4
+
+  bb.4.BB03:
+    successors: %bb.5(0x30000000), %bb.6(0x50000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    CBNZW killed renamable $w0, %bb.6
+    B %bb.5
+
+  bb.5.BB04:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @fun, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.6.exit:
+    RET_ReallyLR
+
+...
+---
+name:            noshrink_bb_as_inlineasmbr_target
+registers:       []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  savePoint:       ''
+  restorePoint:    ''
+body:             |
+  ; CHECK-LABEL: name: noshrink_bb_as_inlineasmbr_target
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   TBZW killed renamable $w0, 0, %bb.3
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.3(0x00000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   INLINEASM_BR &"", 1 /* sideeffect attdialect */, 13 /* imm */, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2 (%ir-block.1):
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   BL @dosomething, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.exit (machine-block-address-taken, inlineasm-br-indirect-target):
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    liveins: $w0
+
+    TBZW killed renamable $w0, 0, %bb.3
+    B %bb.1
+
+  bb.1 (%ir-block.0):
+    successors: %bb.2(0x80000000), %bb.3(0x00000000)
+
+    INLINEASM_BR &"", 1 /* sideeffect attdialect */, 13 /* imm */, %bb.3
+    B %bb.2
+
+  bb.2 (%ir-block.1):
+    successors: %bb.3(0x80000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @dosomething, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+  bb.3.exit (machine-block-address-taken, inlineasm-br-indirect-target):
+    RET_ReallyLR
+
+...

diff  --git a/llvm/test/CodeGen/AArch64/taildup-cfi.ll b/llvm/test/CodeGen/AArch64/taildup-cfi.ll
index 221503009cdb6..4a87ceefbcf03 100644
--- a/llvm/test/CodeGen/AArch64/taildup-cfi.ll
+++ b/llvm/test/CodeGen/AArch64/taildup-cfi.ll
@@ -32,7 +32,7 @@ if.then:                                          ; preds = %entry
   store i32 0, ptr @f, align 4, !tbaa !2
   br label %if.end
 
-; DARWIN-NOT:           Merging into block
+; DARWIN:             Merging into block
 ; LINUX:    	      Merging into block
 
 if.end:                                           ; preds = %entry.if.end_crit_edge, %if.then

diff  --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
index 050696ad653eb..e45985136cf34 100644
--- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
@@ -5,11 +5,11 @@
 define i32 @add_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture readonly %arg2, ptr nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: add_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, lr}
-; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB0_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    sub.w lr, r3, #2
 ; CHECK-LE-NEXT:    subs r2, #2
 ; CHECK-LE-NEXT:    mov.w r12, #0
@@ -22,22 +22,23 @@ define i32 @add_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado
 ; CHECK-LE-NEXT:    sxtah r1, r1, r3
 ; CHECK-LE-NEXT:    smlad r12, r4, r3, r12
 ; CHECK-LE-NEXT:    bne .LBB0_2
-; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-LE-NEXT:  @ %bb.3:
+; CHECK-LE-NEXT:    pop.w {r4, lr}
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ; CHECK-LE-NEXT:  .LBB0_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: add_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    cmp r0, #1
 ; CHECK-BE-NEXT:    blt .LBB0_4
 ; CHECK-BE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    subs r3, #2
 ; CHECK-BE-NEXT:    subs r2, #2
 ; CHECK-BE-NEXT:    mov.w r12, #0
@@ -53,14 +54,15 @@ define i32 @add_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado
 ; CHECK-BE-NEXT:    ldrsh.w r4, [r3, #2]
 ; CHECK-BE-NEXT:    smlabb r12, r5, r4, r12
 ; CHECK-BE-NEXT:    bne .LBB0_2
-; CHECK-BE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-BE-NEXT:  @ %bb.3:
+; CHECK-BE-NEXT:    pop.w {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 ; CHECK-BE-NEXT:  .LBB0_4:
 ; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
@@ -105,11 +107,11 @@ for.body:
 define i32 @mul_bottom_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture readonly %arg2, ptr nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: mul_bottom_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, lr}
-; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB1_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    sub.w lr, r3, #2
 ; CHECK-LE-NEXT:    subs r2, #2
 ; CHECK-LE-NEXT:    mov.w r12, #0
@@ -123,22 +125,23 @@ define i32 @mul_bottom_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocaptur
 ; CHECK-LE-NEXT:    sxth r3, r3
 ; CHECK-LE-NEXT:    mul r1, r3, r1
 ; CHECK-LE-NEXT:    bne .LBB1_2
-; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-LE-NEXT:  @ %bb.3:
+; CHECK-LE-NEXT:    pop.w {r4, lr}
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ; CHECK-LE-NEXT:  .LBB1_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: mul_bottom_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    cmp r0, #1
 ; CHECK-BE-NEXT:    blt .LBB1_4
 ; CHECK-BE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    subs r3, #2
 ; CHECK-BE-NEXT:    subs r2, #2
 ; CHECK-BE-NEXT:    mov.w r12, #0
@@ -154,14 +157,15 @@ define i32 @mul_bottom_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocaptur
 ; CHECK-BE-NEXT:    ldrsh.w r4, [r3, #2]
 ; CHECK-BE-NEXT:    smlabb r12, r5, r4, r12
 ; CHECK-BE-NEXT:    bne .LBB1_2
-; CHECK-BE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-BE-NEXT:  @ %bb.3:
+; CHECK-BE-NEXT:    pop.w {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 ; CHECK-BE-NEXT:  .LBB1_4:
 ; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
@@ -206,11 +210,11 @@ for.body:
 define i32 @mul_top_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture readonly %arg2, ptr nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: mul_top_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, lr}
-; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB2_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    subs r3, #2
 ; CHECK-LE-NEXT:    subs r2, #2
 ; CHECK-LE-NEXT:    mov.w r12, #0
@@ -224,22 +228,23 @@ define i32 @mul_top_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture r
 ; CHECK-LE-NEXT:    asr.w r4, r4, #16
 ; CHECK-LE-NEXT:    mul r1, r4, r1
 ; CHECK-LE-NEXT:    bne .LBB2_2
-; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-LE-NEXT:  @ %bb.3:
+; CHECK-LE-NEXT:    pop.w {r4, lr}
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ; CHECK-LE-NEXT:  .LBB2_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: mul_top_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, lr}
-; CHECK-BE-NEXT:    push {r4, lr}
 ; CHECK-BE-NEXT:    cmp r0, #1
 ; CHECK-BE-NEXT:    blt .LBB2_4
 ; CHECK-BE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-BE-NEXT:    .save {r4, lr}
+; CHECK-BE-NEXT:    push {r4, lr}
 ; CHECK-BE-NEXT:    subs r3, #2
 ; CHECK-BE-NEXT:    subs r2, #2
 ; CHECK-BE-NEXT:    mov.w r12, #0
@@ -255,14 +260,15 @@ define i32 @mul_top_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture r
 ; CHECK-BE-NEXT:    mul r1, r4, r1
 ; CHECK-BE-NEXT:    smlabb r12, r4, lr, r12
 ; CHECK-BE-NEXT:    bne .LBB2_2
-; CHECK-BE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-BE-NEXT:  @ %bb.3:
+; CHECK-BE-NEXT:    pop.w {r4, lr}
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, pc}
+; CHECK-BE-NEXT:    bx lr
 ; CHECK-BE-NEXT:  .LBB2_4:
 ; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
@@ -307,11 +313,11 @@ for.body:
 define i32 @and_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture readonly %arg2, ptr nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: and_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, lr}
-; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB3_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    sub.w lr, r3, #2
 ; CHECK-LE-NEXT:    subs r2, #2
 ; CHECK-LE-NEXT:    mov.w r12, #0
@@ -325,22 +331,23 @@ define i32 @and_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado
 ; CHECK-LE-NEXT:    uxth r3, r3
 ; CHECK-LE-NEXT:    mul r1, r3, r1
 ; CHECK-LE-NEXT:    bne .LBB3_2
-; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-LE-NEXT:  @ %bb.3:
+; CHECK-LE-NEXT:    pop.w {r4, lr}
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ; CHECK-LE-NEXT:  .LBB3_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: and_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    cmp r0, #1
 ; CHECK-BE-NEXT:    blt .LBB3_4
 ; CHECK-BE-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    subs r3, #2
 ; CHECK-BE-NEXT:    subs r2, #2
 ; CHECK-BE-NEXT:    mov.w r12, #0
@@ -356,14 +363,15 @@ define i32 @and_user(i32 %arg, ptr nocapture readnone %arg1, ptr nocapture reado
 ; CHECK-BE-NEXT:    ldrsh.w r4, [r3, #2]
 ; CHECK-BE-NEXT:    smlabb r12, r5, r4, r12
 ; CHECK-BE-NEXT:    bne .LBB3_2
-; CHECK-BE-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-BE-NEXT:  @ %bb.3:
+; CHECK-BE-NEXT:    pop.w {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 ; CHECK-BE-NEXT:  .LBB3_4:
 ; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    add.w r0, r12, r1
-; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %cmp24 = icmp sgt i32 %arg, 0
   br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/ARM/code-placement.ll b/llvm/test/CodeGen/ARM/code-placement.ll
index 7755ff53512ef..01d72f134aacb 100644
--- a/llvm/test/CodeGen/ARM/code-placement.ll
+++ b/llvm/test/CodeGen/ARM/code-placement.ll
@@ -11,7 +11,6 @@ entry:
   br i1 %0, label %bb2, label %bb
 
 bb:
-; CHECK: LBB0_1:
 ; CHECK: LBB0_[[LABEL:[0-9]]]:
 ; CHECK: bne LBB0_[[LABEL]]
 ; CHECK-NOT: b LBB0_[[LABEL]]

diff  --git a/llvm/test/CodeGen/ARM/mbp.ll b/llvm/test/CodeGen/ARM/mbp.ll
index e7ab3860b52ac..4f96029e06b95 100644
--- a/llvm/test/CodeGen/ARM/mbp.ll
+++ b/llvm/test/CodeGen/ARM/mbp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s | FileCheck %s
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7-unknown-linux-gnueabihf"
@@ -6,16 +7,50 @@ target triple = "thumbv7-unknown-linux-gnueabihf"
 %List = type { i32, ptr }
 
 ; The entry block should be the first block of the function.
-; CHECK-LABEL: foo
-; CHECK:       %entry
-; CHECK:       %for.body
-; CHECK:       %for.inc
-; CHECK:       %if.then
-; CHECK:       %for.cond.i
-; CHECK:       %for.body.i
-; CHECK:       %return
 
 define i1 @foo(ptr %ha, i32 %he) !prof !39 {
+; CHECK-LABEL: foo:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r0, #0
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2: @ %for.inc
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    popeq {r7, pc}
+; CHECK-NEXT:  .LBB0_3: @ %for.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    ldr r0, [r2, #4]
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    beq .LBB0_2
+; CHECK-NEXT:  @ %bb.4: @ %if.then
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ldrd r3, r0, [r0]
+; CHECK-NEXT:    sub.w r12, r0, #4
+; CHECK-NEXT:  .LBB0_5: @ %for.cond.i
+; CHECK-NEXT:    @ Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    cmp r3, #1
+; CHECK-NEXT:    blt .LBB0_2
+; CHECK-NEXT:  @ %bb.6: @ %for.body.i
+; CHECK-NEXT:    @ in Loop: Header=BB0_5 Depth=2
+; CHECK-NEXT:    ldr.w lr, [r12, r3, lsl #2]
+; CHECK-NEXT:    subs r3, #1
+; CHECK-NEXT:    movs r0, #1
+; CHECK-NEXT:    cmp lr, r1
+; CHECK-NEXT:    bne .LBB0_5
+; CHECK-NEXT:  @ %bb.7:
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %TargetPtr = load ptr, ptr %ha, align 4
   %cmp1 = icmp eq ptr %TargetPtr, null

diff  --git a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
index 2755d354a6244..c9724674afd82 100644
--- a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
+++ b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
@@ -6,11 +6,11 @@
 define void @ssat_unroll(ptr %pSrcA, ptr %pSrcB, ptr %pDst, i32 %blockSize) {
 ; CHECK-LABEL: ssat_unroll:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB0_1: @ %while.body.preheader
 ; CHECK-NEXT:    .save {r11, lr}
 ; CHECK-NEXT:    push {r11, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB0_5
-; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    sub r12, r3, #1
 ; CHECK-NEXT:    tst r3, #1
 ; CHECK-NEXT:    beq .LBB0_3
@@ -23,7 +23,7 @@ define void @ssat_unroll(ptr %pSrcA, ptr %pSrcB, ptr %pDst, i32 %blockSize) {
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:  .LBB0_3: @ %while.body.prol.loopexit
 ; CHECK-NEXT:    cmp r12, #0
-; CHECK-NEXT:    popeq {r11, pc}
+; CHECK-NEXT:    beq .LBB0_5
 ; CHECK-NEXT:  .LBB0_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r12, [r0]
@@ -41,8 +41,9 @@ define void @ssat_unroll(ptr %pSrcA, ptr %pSrcB, ptr %pDst, i32 %blockSize) {
 ; CHECK-NEXT:    strh r12, [r2, #2]
 ; CHECK-NEXT:    add r2, r2, #4
 ; CHECK-NEXT:    bne .LBB0_4
-; CHECK-NEXT:  .LBB0_5: @ %while.end
-; CHECK-NEXT:    pop {r11, pc}
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
   br i1 %cmp.not7, label %while.end, label %while.body.preheader
@@ -125,11 +126,11 @@ while.end:                                        ; preds = %while.body, %while.
 define void @ssat_unroll_minmax(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture writeonly %pDst, i32 %blockSize) {
 ; CHECK-LABEL: ssat_unroll_minmax:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB1_1: @ %while.body.preheader
 ; CHECK-NEXT:    .save {r11, lr}
 ; CHECK-NEXT:    push {r11, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB1_5
-; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    sub r12, r3, #1
 ; CHECK-NEXT:    tst r3, #1
 ; CHECK-NEXT:    beq .LBB1_3
@@ -142,7 +143,7 @@ define void @ssat_unroll_minmax(ptr nocapture readonly %pSrcA, ptr nocapture rea
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:  .LBB1_3: @ %while.body.prol.loopexit
 ; CHECK-NEXT:    cmp r12, #0
-; CHECK-NEXT:    popeq {r11, pc}
+; CHECK-NEXT:    beq .LBB1_5
 ; CHECK-NEXT:  .LBB1_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r12, [r0]
@@ -160,8 +161,9 @@ define void @ssat_unroll_minmax(ptr nocapture readonly %pSrcA, ptr nocapture rea
 ; CHECK-NEXT:    strh r12, [r2, #2]
 ; CHECK-NEXT:    add r2, r2, #4
 ; CHECK-NEXT:    bne .LBB1_4
-; CHECK-NEXT:  .LBB1_5: @ %while.end
-; CHECK-NEXT:    pop {r11, pc}
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
   br i1 %cmp.not7, label %while.end, label %while.body.preheader

diff  --git a/llvm/test/CodeGen/PowerPC/common-chain-aix32.ll b/llvm/test/CodeGen/PowerPC/common-chain-aix32.ll
index 0cf7119eab84c..35ddcfd9ba6d6 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain-aix32.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain-aix32.ll
@@ -39,19 +39,19 @@ define i64 @two_chain_same_offset_succ_i32(ptr %p, i32 %offset, i32 %base1, i64
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmplwi r6, 0
 ; CHECK-NEXT:    cmpwi cr1, r6, 0
-; CHECK-NEXT:    stw r30, -8(r1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw r31, -4(r1) # 4-byte Folded Spill
 ; CHECK-NEXT:    crandc 4*cr5+lt, 4*cr1+lt, eq
 ; CHECK-NEXT:    cmpwi cr1, r7, 0
-; CHECK-NEXT:    bc 12, 4*cr5+lt, L..BB0_5
+; CHECK-NEXT:    bc 12, 4*cr5+lt, L..BB0_6
 ; CHECK-NEXT:  # %bb.1: # %entry
 ; CHECK-NEXT:    crand 4*cr5+lt, eq, 4*cr1+eq
-; CHECK-NEXT:    bc 12, 4*cr5+lt, L..BB0_5
+; CHECK-NEXT:    bc 12, 4*cr5+lt, L..BB0_6
 ; CHECK-NEXT:  # %bb.2: # %for.body.preheader
 ; CHECK-NEXT:    slwi r8, r4, 1
 ; CHECK-NEXT:    li r10, 0
 ; CHECK-NEXT:    li r11, 0
+; CHECK-NEXT:    stw r30, -8(r1) # 4-byte Folded Spill
 ; CHECK-NEXT:    add r8, r4, r8
+; CHECK-NEXT:    stw r31, -4(r1) # 4-byte Folded Spill
 ; CHECK-NEXT:    add r9, r5, r8
 ; CHECK-NEXT:    add r5, r5, r4
 ; CHECK-NEXT:    add r8, r3, r5
@@ -83,15 +83,15 @@ define i64 @two_chain_same_offset_succ_i32(ptr %p, i32 %offset, i32 %base1, i64
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    crand 4*cr5+lt, eq, 4*cr1+lt
 ; CHECK-NEXT:    bc 12, 4*cr5+lt, L..BB0_3
-; CHECK-NEXT:    b L..BB0_6
-; CHECK-NEXT:  L..BB0_5:
-; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:    li r5, 0
-; CHECK-NEXT:  L..BB0_6: # %for.cond.cleanup
+; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    lwz r31, -4(r1) # 4-byte Folded Reload
 ; CHECK-NEXT:    lwz r30, -8(r1) # 4-byte Folded Reload
 ; CHECK-NEXT:    mr r4, r5
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  L..BB0_6:
+; CHECK-NEXT:    li r3, 0
+; CHECK-NEXT:    li r4, 0
+; CHECK-NEXT:    blr
 entry:
   %add = add nsw i32 %base1, %offset
   %mul = shl nsw i32 %offset, 1

diff  --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index ea8a72e7d11e1..5f8c21e30f8fd 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -137,14 +137,14 @@ define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %bas
 ; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpdi r6, 0
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    ble cr0, .LBB1_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    sldi r7, r4, 1
-; CHECK-NEXT:    sldi r9, r4, 2
 ; CHECK-NEXT:    add r5, r3, r5
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    add r8, r4, r7
+; CHECK-NEXT:    sldi r9, r4, 2
 ; CHECK-NEXT:    mtctr r6
 ; CHECK-NEXT:    add r10, r4, r9
 ; CHECK-NEXT:    .p2align 4
@@ -161,12 +161,11 @@ define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %bas
 ; CHECK-NEXT:    mulld r6, r6, r0
 ; CHECK-NEXT:    maddld r3, r6, r30, r3
 ; CHECK-NEXT:    bdnz .LBB1_2
-; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    blr
 entry:
   %mul = shl nsw i64 %offset, 1
@@ -425,20 +424,20 @@ define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
 ; CHECK-LABEL: not_same_offset_fail:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpdi r6, 0
+; CHECK-NEXT:    ble cr0, .LBB4_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    add r5, r3, r5
+; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    ble cr0, .LBB4_3
-; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    mtctr r6
 ; CHECK-NEXT:    mulli r11, r4, 10
 ; CHECK-NEXT:    sldi r8, r4, 2
-; CHECK-NEXT:    add r5, r3, r5
-; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    add r8, r4, r8
 ; CHECK-NEXT:    sldi r9, r4, 3
-; CHECK-NEXT:    mtctr r6
-; CHECK-NEXT:    sldi r7, r4, 1
 ; CHECK-NEXT:    sub r10, r9, r4
+; CHECK-NEXT:    sldi r7, r4, 1
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB4_2: # %for.body
 ; CHECK-NEXT:    #
@@ -455,14 +454,14 @@ define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
 ; CHECK-NEXT:    mulld r6, r6, r29
 ; CHECK-NEXT:    maddld r3, r6, r28, r3
 ; CHECK-NEXT:    bdnz .LBB4_2
-; CHECK-NEXT:    b .LBB4_4
-; CHECK-NEXT:  .LBB4_3:
-; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:  .LBB4_4: # %for.cond.cleanup
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB4_4:
+; CHECK-NEXT:    li r3, 0
+; CHECK-NEXT:    blr
 entry:
   %mul = shl nsw i64 %offset, 1
   %mul2 = mul nsw i64 %offset, 5

diff  --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 769b358131e9a..37baef6043884 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -192,21 +192,21 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
 ; CHECK-LABEL: test_max_number_reminder:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    cmplwi r4, 0
-; CHECK-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    beq cr0, .LBB2_3
+; CHECK-NEXT:    beq cr0, .LBB2_4
 ; CHECK-NEXT:  # %bb.1: # %bb3.preheader
 ; CHECK-NEXT:    cmpldi r4, 1
 ; CHECK-NEXT:    li r5, 1
 ; CHECK-NEXT:    addi r9, r3, 4002
+; CHECK-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    li r6, -1
+; CHECK-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    li r7, 3
 ; CHECK-NEXT:    li r8, 5
 ; CHECK-NEXT:    li r10, 9
+; CHECK-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    iselgt r3, r4, r5
 ; CHECK-NEXT:    mtctr r3
 ; CHECK-NEXT:    li r3, 0
@@ -232,10 +232,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
 ; CHECK-NEXT:    mulld r11, r11, r26
 ; CHECK-NEXT:    maddld r3, r11, r25, r3
 ; CHECK-NEXT:    bdnz .LBB2_2
-; CHECK-NEXT:    b .LBB2_4
-; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:  .LBB2_4: # %bb45
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
@@ -244,6 +241,9 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
 ; CHECK-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    addi r3, r4, 0
+; CHECK-NEXT:    blr
 bb:
   %i = sext i32 %arg1 to i64
   %i2 = icmp eq i32 %arg1, 0
@@ -475,11 +475,11 @@ define dso_local i64 @test_ds_multiple_chains(ptr %arg, ptr %arg1, i32 signext %
 ; CHECK-LABEL: test_ds_multiple_chains:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    cmplwi r5, 0
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    beq cr0, .LBB5_3
+; CHECK-NEXT:    beq cr0, .LBB5_4
 ; CHECK-NEXT:  # %bb.1: # %bb4.preheader
 ; CHECK-NEXT:    cmpldi r5, 1
 ; CHECK-NEXT:    li r6, 1
+; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    addi r3, r3, 4001
 ; CHECK-NEXT:    addi r4, r4, 4001
 ; CHECK-NEXT:    li r7, 9
@@ -507,13 +507,13 @@ define dso_local i64 @test_ds_multiple_chains(ptr %arg, ptr %arg1, i32 signext %
 ; CHECK-NEXT:    mulld r8, r8, r30
 ; CHECK-NEXT:    maddld r6, r8, r9, r6
 ; CHECK-NEXT:    bdnz .LBB5_2
-; CHECK-NEXT:    b .LBB5_4
-; CHECK-NEXT:  .LBB5_3:
-; CHECK-NEXT:    li r6, 0
-; CHECK-NEXT:  .LBB5_4: # %bb43
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    add r3, r6, r5
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB5_4:
+; CHECK-NEXT:    addi r3, r5, 0
+; CHECK-NEXT:    blr
 bb:
   %i = sext i32 %arg2 to i64
   %i3 = icmp eq i32 %arg2, 0
@@ -595,17 +595,17 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
 ; CHECK-LABEL: test_ds_cross_basic_blocks:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    cmplwi r4, 0
-; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    beq cr0, .LBB6_8
+; CHECK-NEXT:    beq cr0, .LBB6_9
 ; CHECK-NEXT:  # %bb.1: # %bb3
 ; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cmpldi r4, 1
 ; CHECK-NEXT:    li r7, 1
 ; CHECK-NEXT:    addi r6, r3, 4009
+; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    ld r5, .LC0@toc@l(r5)
 ; CHECK-NEXT:    iselgt r3, r4, r7
+; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    li r4, -7
 ; CHECK-NEXT:    li r8, -6
 ; CHECK-NEXT:    li r9, 1
@@ -634,7 +634,7 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
 ; CHECK-NEXT:    mulld r0, r0, r10
 ; CHECK-NEXT:    mulld r0, r0, r9
 ; CHECK-NEXT:    maddld r3, r0, r7, r3
-; CHECK-NEXT:    bdz .LBB6_9
+; CHECK-NEXT:    bdz .LBB6_8
 ; CHECK-NEXT:  .LBB6_4: # %bb5
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    lbzu r0, 1(r5)
@@ -666,12 +666,13 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
 ; CHECK-NEXT:    add r7, r0, r7
 ; CHECK-NEXT:    b .LBB6_3
 ; CHECK-NEXT:  .LBB6_8:
-; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:  .LBB6_9: # %bb64
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB6_9:
+; CHECK-NEXT:    li r3, 0
+; CHECK-NEXT:    blr
 bb:
   %i = sext i32 %arg1 to i64
   %i2 = icmp eq i32 %arg1, 0

diff  --git a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
index b91f20b710a2d..79f2ef3e3746a 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
@@ -6,24 +6,24 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cmpd 5, 7
-; CHECK-NEXT:    std 22, -80(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 23, -72(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 24, -64(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 26, -48(1) # 8-byte Folded Spill
+; CHECK-NEXT:    bgelr 0
+; CHECK-NEXT:  # %bb.1: # %.preheader
 ; CHECK-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 27, 5, 2
 ; CHECK-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 28, 5, 3
 ; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; CHECK-NEXT:    bge 0, .LBB0_6
-; CHECK-NEXT:  # %bb.1: # %.preheader
 ; CHECK-NEXT:    addi 30, 5, 1
-; CHECK-NEXT:    addi 28, 5, 3
-; CHECK-NEXT:    addi 27, 5, 2
 ; CHECK-NEXT:    mulld 12, 8, 5
-; CHECK-NEXT:    addi 29, 3, 16
 ; CHECK-NEXT:    mulld 0, 9, 8
+; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 29, 3, 16
 ; CHECK-NEXT:    sldi 11, 10, 3
+; CHECK-NEXT:    std 22, -80(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 23, -72(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 24, -64(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 25, -56(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 26, -48(1) # 8-byte Folded Spill
 ; CHECK-NEXT:    mulld 30, 8, 30
 ; CHECK-NEXT:    mulld 28, 8, 28
 ; CHECK-NEXT:    mulld 8, 8, 27

diff  --git a/llvm/test/CodeGen/PowerPC/shrink-wrap.ll b/llvm/test/CodeGen/PowerPC/shrink-wrap.ll
index 08c391e34c6f4..12d0b056ca886 100644
--- a/llvm/test/CodeGen/PowerPC/shrink-wrap.ll
+++ b/llvm/test/CodeGen/PowerPC/shrink-wrap.ll
@@ -7,6 +7,9 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-LABEL: shrinkwrapme:
 ; POWERPC64:       # %bb.0: # %entry
 ; POWERPC64-NEXT:    cmpwi 4, 0
+; POWERPC64-NEXT:    ble 0, .LBB0_4
+; POWERPC64-NEXT:  # %bb.1: # %for.body.preheader
+; POWERPC64-NEXT:    addi 4, 4, -1
 ; POWERPC64-NEXT:    std 14, -144(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 15, -136(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 16, -128(1) # 8-byte Folded Spill
@@ -22,14 +25,11 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-NEXT:    std 26, -48(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 27, -40(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; POWERPC64-NEXT:    clrldi 4, 4, 32
+; POWERPC64-NEXT:    addi 4, 4, 1
 ; POWERPC64-NEXT:    std 29, -24(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 30, -16(1) # 8-byte Folded Spill
 ; POWERPC64-NEXT:    std 31, -8(1) # 8-byte Folded Spill
-; POWERPC64-NEXT:    ble 0, .LBB0_3
-; POWERPC64-NEXT:  # %bb.1: # %for.body.preheader
-; POWERPC64-NEXT:    addi 4, 4, -1
-; POWERPC64-NEXT:    clrldi 4, 4, 32
-; POWERPC64-NEXT:    addi 4, 4, 1
 ; POWERPC64-NEXT:    mtctr 4
 ; POWERPC64-NEXT:    li 4, 0
 ; POWERPC64-NEXT:    .p2align 4
@@ -39,10 +39,7 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-NEXT:    add 4, 3, 4
 ; POWERPC64-NEXT:    #NO_APP
 ; POWERPC64-NEXT:    bdnz .LBB0_2
-; POWERPC64-NEXT:    b .LBB0_4
-; POWERPC64-NEXT:  .LBB0_3:
-; POWERPC64-NEXT:    li 4, 0
-; POWERPC64-NEXT:  .LBB0_4: # %for.cond.cleanup
+; POWERPC64-NEXT:  # %bb.3:
 ; POWERPC64-NEXT:    ld 31, -8(1) # 8-byte Folded Reload
 ; POWERPC64-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
 ; POWERPC64-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
@@ -63,10 +60,16 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-NEXT:    ld 15, -136(1) # 8-byte Folded Reload
 ; POWERPC64-NEXT:    ld 14, -144(1) # 8-byte Folded Reload
 ; POWERPC64-NEXT:    blr
+; POWERPC64-NEXT:  .LBB0_4:
+; POWERPC64-NEXT:    li 4, 0
+; POWERPC64-NEXT:    extsw 3, 4
+; POWERPC64-NEXT:    blr
 ;
 ; POWERPC32-AIX-LABEL: shrinkwrapme:
 ; POWERPC32-AIX:       # %bb.0: # %entry
 ; POWERPC32-AIX-NEXT:    cmpwi 4, 0
+; POWERPC32-AIX-NEXT:    ble 0, L..BB0_4
+; POWERPC32-AIX-NEXT:  # %bb.1: # %for.body.preheader
 ; POWERPC32-AIX-NEXT:    stw 14, -72(1) # 4-byte Folded Spill
 ; POWERPC32-AIX-NEXT:    stw 15, -68(1) # 4-byte Folded Spill
 ; POWERPC32-AIX-NEXT:    stw 16, -64(1) # 4-byte Folded Spill
@@ -85,8 +88,6 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC32-AIX-NEXT:    stw 29, -12(1) # 4-byte Folded Spill
 ; POWERPC32-AIX-NEXT:    stw 30, -8(1) # 4-byte Folded Spill
 ; POWERPC32-AIX-NEXT:    stw 31, -4(1) # 4-byte Folded Spill
-; POWERPC32-AIX-NEXT:    ble 0, L..BB0_3
-; POWERPC32-AIX-NEXT:  # %bb.1: # %for.body.preheader
 ; POWERPC32-AIX-NEXT:    mtctr 4
 ; POWERPC32-AIX-NEXT:    li 4, 0
 ; POWERPC32-AIX-NEXT:    .align 4
@@ -96,10 +97,7 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC32-AIX-NEXT:    add 4, 3, 4
 ; POWERPC32-AIX-NEXT:    #NO_APP
 ; POWERPC32-AIX-NEXT:    bdnz L..BB0_2
-; POWERPC32-AIX-NEXT:    b L..BB0_4
-; POWERPC32-AIX-NEXT:  L..BB0_3:
-; POWERPC32-AIX-NEXT:    li 4, 0
-; POWERPC32-AIX-NEXT:  L..BB0_4: # %for.cond.cleanup
+; POWERPC32-AIX-NEXT:  # %bb.3:
 ; POWERPC32-AIX-NEXT:    lwz 31, -4(1) # 4-byte Folded Reload
 ; POWERPC32-AIX-NEXT:    lwz 30, -8(1) # 4-byte Folded Reload
 ; POWERPC32-AIX-NEXT:    lwz 29, -12(1) # 4-byte Folded Reload
@@ -120,10 +118,16 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC32-AIX-NEXT:    lwz 15, -68(1) # 4-byte Folded Reload
 ; POWERPC32-AIX-NEXT:    lwz 14, -72(1) # 4-byte Folded Reload
 ; POWERPC32-AIX-NEXT:    blr
+; POWERPC32-AIX-NEXT:  L..BB0_4:
+; POWERPC32-AIX-NEXT:    li 3, 0
+; POWERPC32-AIX-NEXT:    blr
 ;
 ; POWERPC64-AIX-LABEL: shrinkwrapme:
 ; POWERPC64-AIX:       # %bb.0: # %entry
 ; POWERPC64-AIX-NEXT:    cmpwi 4, 1
+; POWERPC64-AIX-NEXT:    blt 0, L..BB0_4
+; POWERPC64-AIX-NEXT:  # %bb.1: # %for.body.preheader
+; POWERPC64-AIX-NEXT:    addi 4, 4, -1
 ; POWERPC64-AIX-NEXT:    std 14, -144(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 15, -136(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 16, -128(1) # 8-byte Folded Spill
@@ -139,14 +143,11 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-AIX-NEXT:    std 26, -48(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 27, -40(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; POWERPC64-AIX-NEXT:    clrldi 4, 4, 32
+; POWERPC64-AIX-NEXT:    addi 4, 4, 1
 ; POWERPC64-AIX-NEXT:    std 29, -24(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 30, -16(1) # 8-byte Folded Spill
 ; POWERPC64-AIX-NEXT:    std 31, -8(1) # 8-byte Folded Spill
-; POWERPC64-AIX-NEXT:    blt 0, L..BB0_3
-; POWERPC64-AIX-NEXT:  # %bb.1: # %for.body.preheader
-; POWERPC64-AIX-NEXT:    addi 4, 4, -1
-; POWERPC64-AIX-NEXT:    clrldi 4, 4, 32
-; POWERPC64-AIX-NEXT:    addi 4, 4, 1
 ; POWERPC64-AIX-NEXT:    mtctr 4
 ; POWERPC64-AIX-NEXT:    li 4, 0
 ; POWERPC64-AIX-NEXT:    .align 4
@@ -156,10 +157,7 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-AIX-NEXT:    add 4, 3, 4
 ; POWERPC64-AIX-NEXT:    #NO_APP
 ; POWERPC64-AIX-NEXT:    bdnz L..BB0_2
-; POWERPC64-AIX-NEXT:    b L..BB0_4
-; POWERPC64-AIX-NEXT:  L..BB0_3:
-; POWERPC64-AIX-NEXT:    li 4, 0
-; POWERPC64-AIX-NEXT:  L..BB0_4: # %for.cond.cleanup
+; POWERPC64-AIX-NEXT:  # %bb.3:
 ; POWERPC64-AIX-NEXT:    ld 31, -8(1) # 8-byte Folded Reload
 ; POWERPC64-AIX-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
 ; POWERPC64-AIX-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
@@ -180,6 +178,10 @@ define signext i32 @shrinkwrapme(i32 signext %a, i32 signext %lim) {
 ; POWERPC64-AIX-NEXT:    ld 15, -136(1) # 8-byte Folded Reload
 ; POWERPC64-AIX-NEXT:    ld 14, -144(1) # 8-byte Folded Reload
 ; POWERPC64-AIX-NEXT:    blr
+; POWERPC64-AIX-NEXT:  L..BB0_4:
+; POWERPC64-AIX-NEXT:    li 4, 0
+; POWERPC64-AIX-NEXT:    extsw 3, 4
+; POWERPC64-AIX-NEXT:    blr
 entry:
   %cmp5 = icmp sgt i32 %lim, 0
   br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/PowerPC/shrink-wrap.mir b/llvm/test/CodeGen/PowerPC/shrink-wrap.mir
index 1b6ccb92527e7..561b193086bf5 100644
--- a/llvm/test/CodeGen/PowerPC/shrink-wrap.mir
+++ b/llvm/test/CodeGen/PowerPC/shrink-wrap.mir
@@ -48,42 +48,7 @@
 ...
 ---
 name:            shrinkwrapme
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
 tracksRegLiveness: true
-hasWinCFI:       false
-registers:       []
-liveins:
-  - { reg: '$x3', virtual-reg: '' }
-  - { reg: '$x4', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:           []
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
 body:             |
   ; CHECK-LABEL: name: shrinkwrapme
   ; CHECK: bb.0.entry:
@@ -117,11 +82,17 @@ body:             |
   ; CHECK-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4.for.body:
-  ; CHECK-NEXT:   successors: %bb.4(0x7c000000), %bb.3(0x04000000)
+  ; CHECK-NEXT:   successors: %bb.4(0x7c000000), %bb.5(0x04000000)
   ; CHECK-NEXT:   liveins: $r4, $x3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   INLINEASM &"add $0, $1, $2", 0 /* attdialect */, 131082 /* regdef:GPRC */, def renamable $r4, 131081 /* reguse:GPRC */, renamable $r3, 131081 /* reguse:GPRC */, killed renamable $r4, 12 /* clobber */, implicit-def dead early-clobber $r14, 12 /* clobber */, implicit-def dead early-clobber $r15, 12 /* clobber */, implicit-def dead early-clobber $r16, 12 /* clobber */, implicit-def dead early-clobber $r17, 12 /* clobber */, implicit-def dead early-clobber $r18, 12 /* clobber */, implicit-def dead early-clobber $r19, 12 /* clobber */, implicit-def dead early-clobber $r20, 12 /* clobber */, implicit-def dead early-clobber $r21, 12 /* clobber */, implicit-def dead early-clobber $r22, 12 /* clobber */, implicit-def dead early-clobber $r23, 12 /* clobber */, implicit-def dead early-clobber $r24, 12 /* clobber */, implicit-def dead early-clobber $r25, 12 /* clobber */, implicit-def dead early-clobber $r26, 12 /* clobber */, implicit-def dead early-clobber $r27, 12 /* clobber */, implicit-def dead early-clobber $r28, 12 /* clobber */, implicit-def dead early-clobber $r29, 12 /* clobber */, implicit-def dead early-clobber $r30, 12 /* clobber */, implicit-def dead early-clobber $r31
   ; CHECK-NEXT:   BDNZ8 %bb.4, implicit-def dead $ctr8, implicit $ctr8
+  ; CHECK-NEXT:   B %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $r4
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   B %bb.3
   bb.0.entry:
     successors: %bb.2(0x50000000), %bb.1(0x30000000)

diff  --git a/llvm/test/CodeGen/RISCV/aext-to-sext.ll b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
index 806c495fa6777..0aa04f40f6a52 100644
--- a/llvm/test/CodeGen/RISCV/aext-to-sext.ll
+++ b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
@@ -11,21 +11,22 @@
 define void @quux(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-LABEL: quux:
 ; RV64I:       # %bb.0: # %bb
+; RV64I-NEXT:    beq a0, a1, .LBB0_4
+; RV64I-NEXT:  # %bb.1: # %bb2.preheader
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    beq a0, a1, .LBB0_3
-; RV64I-NEXT:  # %bb.1: # %bb2.preheader
 ; RV64I-NEXT:    subw s0, a1, a0
 ; RV64I-NEXT:  .LBB0_2: # %bb2
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    call hoge@plt
 ; RV64I-NEXT:    addiw s0, s0, -1
 ; RV64I-NEXT:    bnez s0, .LBB0_2
-; RV64I-NEXT:  .LBB0_3: # %bb6
+; RV64I-NEXT:  # %bb.3:
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:  .LBB0_4: # %bb6
 ; RV64I-NEXT:    ret
 bb:
   %tmp = icmp eq i32 %arg, %arg1

diff  --git a/llvm/test/CodeGen/RISCV/fli-licm.ll b/llvm/test/CodeGen/RISCV/fli-licm.ll
index 93bb934c1cb0d..f37ace801b159 100644
--- a/llvm/test/CodeGen/RISCV/fli-licm.ll
+++ b/llvm/test/CodeGen/RISCV/fli-licm.ll
@@ -12,11 +12,11 @@
 define void @process_nodes(ptr %0) nounwind {
 ; RV32-LABEL: process_nodes:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    beqz a0, .LBB0_4
+; RV32-NEXT:  # %bb.1: # %loop.preheader
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    beqz a0, .LBB0_3
-; RV32-NEXT:  # %bb.1: # %loop.preheader
 ; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:  .LBB0_2: # %loop
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -25,19 +25,20 @@ define void @process_nodes(ptr %0) nounwind {
 ; RV32-NEXT:    call do_it@plt
 ; RV32-NEXT:    lw s0, 0(s0)
 ; RV32-NEXT:    bnez s0, .LBB0_2
-; RV32-NEXT:  .LBB0_3: # %exit
+; RV32-NEXT:  # %bb.3:
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:  .LBB0_4: # %exit
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: process_nodes:
 ; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    beqz a0, .LBB0_4
+; RV64-NEXT:  # %bb.1: # %loop.preheader
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64-NEXT:    beqz a0, .LBB0_3
-; RV64-NEXT:  # %bb.1: # %loop.preheader
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:  .LBB0_2: # %loop
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -46,10 +47,11 @@ define void @process_nodes(ptr %0) nounwind {
 ; RV64-NEXT:    call do_it@plt
 ; RV64-NEXT:    ld s0, 0(s0)
 ; RV64-NEXT:    bnez s0, .LBB0_2
-; RV64-NEXT:  .LBB0_3: # %exit
+; RV64-NEXT:  # %bb.3:
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:  .LBB0_4: # %exit
 ; RV64-NEXT:    ret
 entry:
   %1 = icmp eq ptr %0, null

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
index d67e66d7a7131..421b5b5364d35 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
@@ -4,11 +4,13 @@
 define i32 @test(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
 ; CHECK-LABEL: test:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB0_4
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov lr, r0
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
@@ -21,10 +23,7 @@ define i32 @test(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
 ; CHECK-NEXT:    @NO_APP
 ; CHECK-NEXT:    add r0, r3
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp9 = icmp sgt i32 %n, 0
@@ -51,11 +50,13 @@ for.body:                                         ; preds = %entry, %for.body
 define i32 @testlr(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
 ; CHECK-LABEL: testlr:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB1_1: @ %for.body.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB1_4
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
@@ -68,10 +69,7 @@ define i32 @testlr(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    @NO_APP
 ; CHECK-NEXT:    add r0, r4
 ; CHECK-NEXT:    bne .LBB1_2
-; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
-; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp9 = icmp sgt i32 %n, 0

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 99d169e63e5a5..59b32a3f441c1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -4,11 +4,12 @@
 define void @test_memcpy(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memcpy:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB0_5
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    lsl.w r12, r3, #2
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    b .LBB0_2
@@ -31,8 +32,9 @@ define void @test_memcpy(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i3
 ; CHECK-NEXT:    vstrb.8 q0, [r5], #16
 ; CHECK-NEXT:    letp lr, .LBB0_4
 ; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %for.body, label %for.cond.cleanup
@@ -55,12 +57,12 @@ for.body:                                         ; preds = %entry, %for.body
 define void @test_memset(ptr nocapture %x, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memset:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
@@ -80,8 +82,9 @@ define void @test_memset(ptr nocapture %x, i32 %n, i32 %m) {
 ; CHECK-NEXT:    vstrb.8 q0, [r12], #16
 ; CHECK-NEXT:    letp lr, .LBB1_4
 ; CHECK-NEXT:    b .LBB1_3
-; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp5 = icmp sgt i32 %n, 0
   br i1 %cmp5, label %for.body, label %for.cond.cleanup
@@ -102,13 +105,14 @@ for.body:                                         ; preds = %entry, %for.body
 define void @test_memmove(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memmove:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB2_3
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r8, r3
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r9, r1
@@ -124,9 +128,10 @@ define void @test_memmove(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i
 ; CHECK-NEXT:    add r6, r4
 ; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB2_2
-; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %for.body, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 13e39a8f16e33..23eb5900bb7d1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -4,10 +4,11 @@
 define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
 ; CHECK-LABEL: float_float_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB0_10
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB0_3
 ; CHECK-NEXT:  @ %bb.2:
@@ -80,8 +81,9 @@ define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr noca
 ; CHECK-NEXT:    vmul.f32 s0, s2, s0
 ; CHECK-NEXT:    vstr s0, [r5, #12]
 ; CHECK-NEXT:    bne .LBB0_9
-; CHECK-NEXT:  .LBB0_10: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  .LBB0_10:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:  .LBB0_11: @ %vector.ph
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
@@ -215,10 +217,11 @@ for.body:                                         ; preds = %for.body.prol.loope
 define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
 ; CHECK-LABEL: float_float_add:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB1_10
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB1_1: @ %for.body.preheader
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB1_3
 ; CHECK-NEXT:  @ %bb.2:
@@ -291,8 +294,9 @@ define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr noca
 ; CHECK-NEXT:    vadd.f32 s0, s2, s0
 ; CHECK-NEXT:    vstr s0, [r5, #12]
 ; CHECK-NEXT:    bne .LBB1_9
-; CHECK-NEXT:  .LBB1_10: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  .LBB1_10:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:  .LBB1_11: @ %vector.ph
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
@@ -426,10 +430,11 @@ for.body:                                         ; preds = %for.body.prol.loope
 define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
 ; CHECK-LABEL: float_float_sub:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB2_10
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB2_3
 ; CHECK-NEXT:  @ %bb.2:
@@ -502,8 +507,9 @@ define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr noca
 ; CHECK-NEXT:    vsub.f32 s0, s2, s0
 ; CHECK-NEXT:    vstr s0, [r5, #12]
 ; CHECK-NEXT:    bne .LBB2_9
-; CHECK-NEXT:  .LBB2_10: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  .LBB2_10:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:  .LBB2_11: @ %vector.ph
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
@@ -637,10 +643,11 @@ for.body:                                         ; preds = %for.body.prol.loope
 define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
 ; CHECK-LABEL: float_int_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq.w .LBB3_13
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB3_1: @ %for.body.preheader
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bls .LBB3_6
 ; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
@@ -729,8 +736,9 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT:    vmul.f32 s0, s2, s0
 ; CHECK-NEXT:    vstr s0, [r6, #12]
 ; CHECK-NEXT:    bne .LBB3_12
-; CHECK-NEXT:  .LBB3_13: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  .LBB3_13:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index eb98b85eafc90..93119eac2d564 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -411,10 +411,12 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
 define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
 ; CHECK-LABEL: two_loops_mul_add_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq .LBB6_8
-; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r0, #0
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB6_1: @ %vector.ph
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
@@ -461,12 +463,10 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block44
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u32 r12, q0
-; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup7
+; CHECK-NEXT:  .LBB6_7:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
-; CHECK-NEXT:  .LBB6_8:
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp35 = icmp eq i32 %N, 0
   br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index caf7a339805fc..1f3a43923db61 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@@ -4,10 +4,11 @@
 define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noalias nocapture %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: test:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    blt .LBB0_7
-; CHECK-NEXT:  @ %bb.1: @ %for.cond1.preheader.us.preheader
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.cond1.preheader.us.preheader
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    mov r8, r3
 ; CHECK-NEXT:    lsl.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
@@ -47,8 +48,9 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali
 ; CHECK-NEXT:    add r4, r12
 ; CHECK-NEXT:    cmp r3, r8
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  .LBB0_7: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  @ %bb.7:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp252 = icmp sgt i32 %n, 0
   br i1 %cmp252, label %for.cond1.preheader.us, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
index 9ef5a46edf934..be1f1de71be3d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -5,17 +5,19 @@
 define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) {
 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
+; CHECK-NEXT:    cmp.w r12, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.cond3.preheader.lr.ph
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
-; CHECK-NEXT:    cmp.w r12, #1
-; CHECK-NEXT:    itt ge
-; CHECK-NEXT:    ldrshge.w r7, [r2]
-; CHECK-NEXT:    cmpge r7, #1
-; CHECK-NEXT:    blt.w .LBB0_5
-; CHECK-NEXT:  @ %bb.1: @ %for.cond3.preheader.us.preheader
+; CHECK-NEXT:    ldrsh.w r7, [r2]
+; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    blt.w .LBB0_6
+; CHECK-NEXT:  @ %bb.2: @ %for.cond3.preheader.us.preheader
 ; CHECK-NEXT:    movs r2, #252
 ; CHECK-NEXT:    ldr r4, [sp, #152]
 ; CHECK-NEXT:    and.w r6, r2, r3, lsr #3
@@ -46,14 +48,14 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:  .LBB0_2: @ %vector.ph
+; CHECK-NEXT:  .LBB0_3: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r6, r7
 ; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:  .LBB0_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:  .LBB0_4: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.16 r6
 ; CHECK-NEXT:    subs r6, #8
@@ -89,18 +91,19 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
 ; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q0, [r5], #16
-; CHECK-NEXT:    le lr, .LBB0_3
-; CHECK-NEXT:  @ %bb.4: @ %for.cond3.for.cond.cleanup7_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB0_4
+; CHECK-NEXT:  @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    adds r4, #1
 ; CHECK-NEXT:    add.w r0, r0, r1, lsl #1
 ; CHECK-NEXT:    cmp r4, r12
-; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne .LBB0_3
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
   %0 = load i16, ptr %iHeight, align 2
@@ -184,18 +187,19 @@ for.cond.cleanup:                                 ; preds = %for.cond3.for.cond.
 define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) "target-cpu"="cortex-m55" {
 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha_sched:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    sub sp, #80
 ; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
 ; CHECK-NEXT:    cmp.w r12, #1
-; CHECK-NEXT:    blt.w .LBB1_6
+; CHECK-NEXT:    blt.w .LBB1_7
 ; CHECK-NEXT:  @ %bb.1: @ %for.cond3.preheader.lr.ph
 ; CHECK-NEXT:    ldrsh.w r2, [r2]
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB1_6
-; CHECK-NEXT:  @ %bb.2: @ %for.cond3.preheader.us.preheader
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB1_2: @ %for.cond3.preheader.us.preheader
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    sub sp, #80
 ; CHECK-NEXT:    ldr r7, [sp, #168]
 ; CHECK-NEXT:    movs r5, #120
 ; CHECK-NEXT:    lsls r6, r3, #3
@@ -265,11 +269,13 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
 ; CHECK-NEXT:    adds r4, #1
 ; CHECK-NEXT:    cmp r4, r12
 ; CHECK-NEXT:    bne .LBB1_3
-; CHECK-NEXT:  .LBB1_6: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:    add sp, #80
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
+; CHECK-NEXT:  .LBB1_7: @ %for.cond.cleanup
+; CHECK-NEXT:    bx lr
 entry:
   %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
   %0 = load i16, ptr %iHeight, align 2

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index fc58873f9857b..3b42ee36e7c2e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -53,10 +53,12 @@ if.end:                                           ; preds = %do.body, %entry
 define void @nested(ptr nocapture readonly %x, ptr nocapture readnone %y, ptr nocapture %z, i32 %m, i32 %n) {
 ; CHECK-LABEL: nested:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB1_1: @ %for.body.preheader
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    cbz r3, .LBB1_8
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    ldr.w r12, [sp, #24]
 ; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    b .LBB1_4
@@ -91,8 +93,9 @@ define void @nested(ptr nocapture readonly %x, ptr nocapture readnone %y, ptr no
 ; CHECK-NEXT:    sub.w r12, r12, r5
 ; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    b .LBB1_3
-; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:  .LBB1_8:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp20.not = icmp eq i32 %m, 0
   br i1 %cmp20.not, label %for.cond.cleanup, label %for.body

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 6228d616b5842..b7b19a477ab0f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -981,6 +981,13 @@ if.end61:                                         ; preds = %if.then59, %while.e
 define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
 ; CHECK-LABEL: fir:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #8
+; CHECK-NEXT:    blo.w .LBB16_13
+; CHECK-NEXT:  @ %bb.1: @ %if.then
+; CHECK-NEXT:    lsrs.w r12, r3, #2
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:  .LBB16_2: @ %while.body.lr.ph
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
@@ -989,12 +996,6 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    cmp r3, #8
-; CHECK-NEXT:    blo.w .LBB16_12
-; CHECK-NEXT:  @ %bb.1: @ %if.then
-; CHECK-NEXT:    lsrs.w r12, r3, #2
-; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r6, [r0]
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    ldrd r4, r10, [r0, #4]
@@ -1106,11 +1107,13 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
 ; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_12: @ %if.end
+; CHECK-NEXT:  .LBB16_12:
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:  .LBB16_13: @ %if.end
+; CHECK-NEXT:    bx lr
 entry:
   %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
   %i = load ptr, ptr %pState1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 24f1831a3f07c..0335d24c0a782 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -290,12 +290,12 @@ end:
 define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: gather_inc_v4i32_simple:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph.preheader
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
@@ -319,8 +319,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    @ in Loop: Header=BB8_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB8_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI8_0:
@@ -359,13 +360,14 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: gather_inc_v4i32_complex:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB9_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB9_5
-; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
@@ -401,9 +403,10 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB9_2
-; CHECK-NEXT:  .LBB9_5: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI9_0:
@@ -461,12 +464,12 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: gather_inc_v4i32_large:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
@@ -490,8 +493,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readon
 ; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB10_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI10_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
index 9093b9af00656..ea186cd6ed2d4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
@@ -4,12 +4,12 @@
 define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: gather_inc_v4i32_simple:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph.preheader
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
@@ -33,8 +33,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI0_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
index 5f3a12711dc0f..da59cb259db61 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
@@ -211,12 +211,12 @@ entry:
 define void @test11(ptr nocapture %x, ptr nocapture %y, i32 %n) {
 ; CHECK-LABEL: test11:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp.w r2, #-1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    popgt {r4, pc}
+; CHECK-NEXT:    bxgt lr
 ; CHECK-NEXT:  .LBB10_1: @ %prehead
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    mov r12, r1
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    wlstp.8 lr, r2, .LBB10_3
@@ -230,8 +230,9 @@ define void @test11(ptr nocapture %x, ptr nocapture %y, i32 %n) {
 ; CHECK-NEXT:    subs r2, #2
 ; CHECK-NEXT:    strb r3, [r1], #1
 ; CHECK-NEXT:    bne .LBB10_3
-; CHECK-NEXT:  @ %bb.4: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp6 = icmp slt i32 %n, 0
   br i1 %cmp6, label %prehead, label %for.cond.cleanup
@@ -440,12 +441,12 @@ declare void @other()
 define void @multilooped_exit(i32 %b) {
 ; CHECK-LABEL: multilooped_exit:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r0, #1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r4, pc}
+; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB18_1: @ %loop.preheader
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    mov.w r4, #-1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    b .LBB18_3
@@ -498,8 +499,9 @@ define void @multilooped_exit(i32 %b) {
 ; CHECK-NEXT:    vstrb.8 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB18_11
 ; CHECK-NEXT:    b .LBB18_2
-; CHECK-NEXT:  .LBB18_12: @ %exit
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  .LBB18_12:
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %cmp8 = icmp sgt i32 %b, 0
   br i1 %cmp8, label %loop, label %exit

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index 7e059ae726fc6..45bb70ec44b73 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -6,13 +6,14 @@
 define void @DCT_mve1(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
 ; CHECK-LABEL: DCT_mve1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    ldr r3, [r0, #4]
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    cmp.w r12, #2
-; CHECK-NEXT:    blo .LBB0_5
-; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    bxlo lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    ldr r5, [r0, #8]
 ; CHECK-NEXT:    ldr r3, [r0]
 ; CHECK-NEXT:    add.w r3, r3, r5, lsl #2
@@ -43,8 +44,9 @@ define void @DCT_mve1(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    vstr s0, [r7]
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
   %i = load i32, ptr %NumInputs, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index 94397f0ae587b..3a14e650bd53a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -127,15 +127,16 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <1
 define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, ptr %dst, i32 %n) {
 ; CHECK-LABEL: scatter_inc_v4i32_complex:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r1, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB3_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    cmp r1, #1
-; CHECK-NEXT:    blt .LBB3_5
-; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    adr r4, .LCPI3_2
 ; CHECK-NEXT:    bic r2, r1, #3
 ; CHECK-NEXT:    vldrw.u32 q3, [r4]
@@ -168,10 +169,11 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
 ; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    bne .LBB3_2
-; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop.w {r4, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI3_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
index 85425db1eb6c8..42a00b61b4183 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
@@ -58,11 +58,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @start11(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
 ; CHECK-LABEL: start11:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    blt .LBB1_3
-; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    adds r4, r3, #3
 ; CHECK-NEXT:    adr r5, .LCPI1_0
@@ -85,8 +86,9 @@ define arm_aapcs_vfpcc void @start11(ptr nocapture readonly %x, ptr nocapture re
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q3, [r2], #16
 ; CHECK-NEXT:    bne .LBB1_2
-; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI1_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
index da0cd57d86dbb..0a26d9920981b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
@@ -4,11 +4,13 @@
 define arm_aapcs_vfpcc void @test32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, ptr nocapture %z, i32 %n) {
 ; CHECK-LABEL: test32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body.preheader
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    blt .LBB0_2
-; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
@@ -26,9 +28,10 @@ define arm_aapcs_vfpcc void @test32(ptr noalias nocapture readonly %x, ptr noali
 ; CHECK-NEXT:    lsrl r4, r5, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r12
 ; CHECK-NEXT:    vstrb.8 q2, [r2], #16
-; CHECK-NEXT:    bne .LBB0_1
-; CHECK-NEXT:  .LBB0_2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
+; CHECK-NEXT:    bx lr
 entry:
   %0 = and i32 %n, 3
   %cmp = icmp eq i32 %0, 0

diff  --git a/llvm/test/CodeGen/X86/fold-call-3.ll b/llvm/test/CodeGen/X86/fold-call-3.ll
index 9c9a50d3e9ce1..691f46b9eeb0e 100644
--- a/llvm/test/CodeGen/X86/fold-call-3.ll
+++ b/llvm/test/CodeGen/X86/fold-call-3.ll
@@ -13,12 +13,12 @@
 define void @_Z25RawPointerPerformanceTestPvRN5clang6ActionE(ptr %Val, ptr %Actions) nounwind {
 ; CHECK-LABEL: _Z25RawPointerPerformanceTestPvRN5clang6ActionE:
 ; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    cmpl $0, _NumTrials(%rip)
+; CHECK-NEXT:    je LBB0_4
+; CHECK-NEXT:  ## %bb.1: ## %bb.nph
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    subq $24, %rsp
-; CHECK-NEXT:    cmpl $0, _NumTrials(%rip)
-; CHECK-NEXT:    je LBB0_3
-; CHECK-NEXT:  ## %bb.1: ## %bb.nph
 ; CHECK-NEXT:    movq %rsi, %rbx
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    xorl %ebp, %ebp
@@ -34,20 +34,21 @@ define void @_Z25RawPointerPerformanceTestPvRN5clang6ActionE(ptr %Val, ptr %Acti
 ; CHECK-NEXT:    incl %ebp
 ; CHECK-NEXT:    cmpl _NumTrials(%rip), %ebp
 ; CHECK-NEXT:    jb LBB0_2
-; CHECK-NEXT:  LBB0_3: ## %return
+; CHECK-NEXT:  ## %bb.3:
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  LBB0_4: ## %return
 ; CHECK-NEXT:    retq
 ;
 ; pre-RA-LABEL: _Z25RawPointerPerformanceTestPvRN5clang6ActionE:
 ; pre-RA:       ## %bb.0: ## %entry
+; pre-RA-NEXT:    cmpl $0, _NumTrials(%rip)
+; pre-RA-NEXT:    je LBB0_4
+; pre-RA-NEXT:  ## %bb.1: ## %bb.nph
 ; pre-RA-NEXT:    pushq %rbp
 ; pre-RA-NEXT:    pushq %rbx
 ; pre-RA-NEXT:    subq $24, %rsp
-; pre-RA-NEXT:    cmpl $0, _NumTrials(%rip)
-; pre-RA-NEXT:    je LBB0_3
-; pre-RA-NEXT:  ## %bb.1: ## %bb.nph
 ; pre-RA-NEXT:    movq %rsi, %rbx
 ; pre-RA-NEXT:    movq %rdi, %rax
 ; pre-RA-NEXT:    xorl %ebp, %ebp
@@ -63,10 +64,11 @@ define void @_Z25RawPointerPerformanceTestPvRN5clang6ActionE(ptr %Val, ptr %Acti
 ; pre-RA-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
 ; pre-RA-NEXT:    cmpl _NumTrials(%rip), %ebp
 ; pre-RA-NEXT:    jb LBB0_2
-; pre-RA-NEXT:  LBB0_3: ## %return
+; pre-RA-NEXT:  ## %bb.3:
 ; pre-RA-NEXT:    addq $24, %rsp
 ; pre-RA-NEXT:    popq %rbx
 ; pre-RA-NEXT:    popq %rbp
+; pre-RA-NEXT:  LBB0_4: ## %return
 ; pre-RA-NEXT:    retq
 entry:
   %i = alloca %"struct.clang::ActionBase::ActionResult<0u>", align 8

diff  --git a/llvm/test/CodeGen/X86/negative-stride-fptosi-user.ll b/llvm/test/CodeGen/X86/negative-stride-fptosi-user.ll
index e21d4de178719..d0d46b5f11836 100644
--- a/llvm/test/CodeGen/X86/negative-stride-fptosi-user.ll
+++ b/llvm/test/CodeGen/X86/negative-stride-fptosi-user.ll
@@ -9,12 +9,14 @@
 define void @foo(i32 %N) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB0_1
+; CHECK-NEXT:  # %bb.4: # %return
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1: # %bb.preheader
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    jns .LBB0_3
-; CHECK-NEXT:  # %bb.1: # %bb.preheader
 ; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -26,7 +28,7 @@ define void @foo(i32 %N) nounwind {
 ; CHECK-NEXT:    decl %ebp
 ; CHECK-NEXT:    cmpl %ebp, %ebx
 ; CHECK-NEXT:    jne .LBB0_2
-; CHECK-NEXT:  .LBB0_3: # %return
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %rbp

diff  --git a/llvm/test/CodeGen/X86/pr44412.ll b/llvm/test/CodeGen/X86/pr44412.ll
index 6c33666fb5c3a..67579a5bb7c52 100644
--- a/llvm/test/CodeGen/X86/pr44412.ll
+++ b/llvm/test/CodeGen/X86/pr44412.ll
@@ -4,10 +4,10 @@
 define void @bar(i32 %0, i32 %1) nounwind {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:    je .LBB0_4
 ; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    decl %ebx
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -16,8 +16,9 @@ define void @bar(i32 %0, i32 %1) nounwind {
; CHECK-NEXT:    callq foo@PLT
 ; CHECK-NEXT:    addl $-1, %ebx
 ; CHECK-NEXT:    jb .LBB0_2
-; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    retq
   %3 = icmp eq i32 %0, 0
   br i1 %3, label %8, label %4
@@ -36,10 +37,10 @@ define void @bar(i32 %0, i32 %1) nounwind {
 define void @baz(i32 %0, i32 %1) nounwind {
 ; CHECK-LABEL: baz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    je .LBB1_3
+; CHECK-NEXT:    je .LBB1_4
 ; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    decl %ebx
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -48,8 +49,9 @@ define void @baz(i32 %0, i32 %1) nounwind {
; CHECK-NEXT:    callq foo@PLT
 ; CHECK-NEXT:    addl $-1, %ebx
 ; CHECK-NEXT:    jae .LBB1_2
-; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    retq
   %3 = icmp eq i32 %0, 0
   br i1 %3, label %8, label %4

diff  --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index ec4a12eadb94e..f22ea739092f6 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -639,40 +639,40 @@ declare hidden fastcc ptr @find_temp_slot_from_address(ptr readonly)
 define void @useLEA(ptr readonly %x) {
 ; ENABLE-LABEL: useLEA:
 ; ENABLE:       ## %bb.0: ## %entry
-; ENABLE-NEXT:    pushq %rax
-; ENABLE-NEXT:    .cfi_def_cfa_offset 16
 ; ENABLE-NEXT:    testq %rdi, %rdi
-; ENABLE-NEXT:    je LBB8_7
+; ENABLE-NEXT:    je LBB8_9
 ; ENABLE-NEXT:  ## %bb.1: ## %if.end
 ; ENABLE-NEXT:    cmpw $66, (%rdi)
-; ENABLE-NEXT:    jne LBB8_7
+; ENABLE-NEXT:    jne LBB8_9
 ; ENABLE-NEXT:  ## %bb.2: ## %lor.lhs.false
+; ENABLE-NEXT:    pushq %rax
+; ENABLE-NEXT:    .cfi_def_cfa_offset 16
 ; ENABLE-NEXT:    movq 8(%rdi), %rdi
 ; ENABLE-NEXT:    movzwl (%rdi), %eax
 ; ENABLE-NEXT:    leal -54(%rax), %ecx
 ; ENABLE-NEXT:    cmpl $14, %ecx
 ; ENABLE-NEXT:    ja LBB8_3
-; ENABLE-NEXT:  ## %bb.8: ## %lor.lhs.false
+; ENABLE-NEXT:  ## %bb.7: ## %lor.lhs.false
 ; ENABLE-NEXT:    movl $24599, %edx ## imm = 0x6017
 ; ENABLE-NEXT:    btl %ecx, %edx
 ; ENABLE-NEXT:    jae LBB8_3
-; ENABLE-NEXT:  LBB8_7: ## %cleanup
-; ENABLE-NEXT:    popq %rax
+; ENABLE-NEXT:  LBB8_8:
+; ENABLE-NEXT:    addq $8, %rsp
+; ENABLE-NEXT:  LBB8_9: ## %cleanup
 ; ENABLE-NEXT:    retq
 ; ENABLE-NEXT:  LBB8_3: ## %lor.lhs.false
 ; ENABLE-NEXT:    cmpl $134, %eax
-; ENABLE-NEXT:    je LBB8_7
+; ENABLE-NEXT:    je LBB8_8
 ; ENABLE-NEXT:  ## %bb.4: ## %lor.lhs.false
 ; ENABLE-NEXT:    cmpl $140, %eax
-; ENABLE-NEXT:    je LBB8_7
+; ENABLE-NEXT:    je LBB8_8
 ; ENABLE-NEXT:  ## %bb.5: ## %if.end.55
 ; ENABLE-NEXT:    callq _find_temp_slot_from_address
 ; ENABLE-NEXT:    testq %rax, %rax
-; ENABLE-NEXT:    je LBB8_7
+; ENABLE-NEXT:    je LBB8_8
 ; ENABLE-NEXT:  ## %bb.6: ## %if.then.60
 ; ENABLE-NEXT:    movb $1, 57(%rax)
-; ENABLE-NEXT:    popq %rax
-; ENABLE-NEXT:    retq
+; ENABLE-NEXT:    jmp LBB8_8
 ;
 ; DISABLE-LABEL: useLEA:
 ; DISABLE:       ## %bb.0: ## %entry

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
index 2069e974c6905..536f9912f1b6f 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
@@ -23,7 +23,7 @@ define i32 @test(i32 %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %cmp13 = icmp sgt i32 %c, 0
@@ -62,7 +62,7 @@ define i64 @IVIncHoist_not_all_user_in_header(i32 %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    add x10, x1, #4
 ; CHECK-NEXT:    add x11, x2, #8
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:  .LBB1_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr w12, [x10, x8, lsl #2]
@@ -142,7 +142,7 @@ define i32 @negative_test_type_is_struct(i32 %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   %cmp13 = icmp sgt i32 %c, 0

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index fa1c208ffbd77..63a3c725ae89e 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -182,12 +182,12 @@ exit:
 define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
 ; X64-LABEL: extrastride:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    testl %r9d, %r9d
-; X64-NEXT:    je .LBB2_3
+; X64-NEXT:    je .LBB2_4
 ; X64-NEXT:  # %bb.1: # %for.body.lr.ph
+; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    leal (%rsi,%rsi), %r10d
 ; X64-NEXT:    leal (%rsi,%rsi,2), %r11d
 ; X64-NEXT:    addl %esi, %ecx
@@ -213,8 +213,9 @@ define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %
 ; X64-NEXT:    addq %r8, %rdx
 ; X64-NEXT:    decl %r9d
 ; X64-NEXT:    jne .LBB2_2
-; X64-NEXT:  .LBB2_3: # %for.end
+; X64-NEXT:  # %bb.3:
 ; X64-NEXT:    popq %rbx
+; X64-NEXT:  .LBB2_4: # %for.end
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: extrastride:


        

