[llvm] a294d9e - Revert "[IPRA][ARM] Spill extra registers at -Oz"

Mon Apr 6 02:35:09 PDT 2020

Author: Oliver Stannard
Date: 2020-04-06T10:34:59+01:00
New Revision: a294d9eb2152ccd5c44b9e45ad291a199a944c56

URL: https://github.com/llvm/llvm-project/commit/a294d9eb2152ccd5c44b9e45ad291a199a944c56
DIFF: https://github.com/llvm/llvm-project/commit/a294d9eb2152ccd5c44b9e45ad291a199a944c56.diff

LOG: Revert "[IPRA][ARM] Spill extra registers at -Oz"

Reverting because this is causing failures on bots with expensive checks
enabled.

This reverts commit 73cea83a6f5ab521edf3cccfc603534776d691ec.

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/MachineRegisterInfo.h
    llvm/lib/CodeGen/MachineRegisterInfo.cpp
    llvm/lib/CodeGen/PrologEpilogInserter.cpp
    llvm/lib/Target/ARM/ARMFrameLowering.cpp
    llvm/lib/Target/ARM/ARMFrameLowering.h
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
    llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll

Removed: 
    llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
    llvm/test/CodeGen/ARM/ipra-extra-spills.ll


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 064b6075c095..c5a90b0c46e3 100644

--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -91,8 +91,6 @@ class MachineRegisterInfo {
   /// all registers that were disabled are removed from the list.
   SmallVector<MCPhysReg, 16> UpdatedCSRs;
 
-  void initUpdatedCSRs();
-
   /// RegAllocHints - This vector records register allocation hints for
   /// virtual registers. For each virtual register, it keeps a pair of hint
   /// type and hints vector making up the allocation hints. Only the first
@@ -233,17 +231,12 @@ class MachineRegisterInfo {
 
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
-  /// \see UpdatedCSRs.
-  void disableCalleeSavedRegister(Register Reg);
-
-  /// Enables the register from the list of CSRs.
-  /// I.e. the register will appear as part of the CSR mask.
-  /// \see UpdatedCSRs.
-  void enableCalleeSavedRegister(Register Reg);
+  /// \see UpdatedCalleeSavedRegs.
+  void disableCalleeSavedRegister(unsigned Reg);
 
   /// Returns list of callee saved registers.
   /// The function returns the updated CSR list (after taking into account
-  /// registers that are enabled/disabled from the CSR list).
+  /// registers that are disabled from the CSR list).
   const MCPhysReg *getCalleeSavedRegs() const;
 
   /// Sets the updated Callee Saved Registers list.

diff  --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 12f1bafe1fb2..270ba125df00 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -610,54 +610,30 @@ bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg) const {
   return false;
 }
 
-void MachineRegisterInfo::initUpdatedCSRs() {
-  if (IsUpdatedCSRsInitialized)
-    return;
-
-  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
-  for (const MCPhysReg *I = CSR; *I; ++I)
-    UpdatedCSRs.push_back(*I);
-
-  // Zero value represents the end of the register list
-  // (no more registers should be pushed).
-  UpdatedCSRs.push_back(0);
-
-  IsUpdatedCSRsInitialized = true;
-}
+void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
 
-void MachineRegisterInfo::disableCalleeSavedRegister(Register Reg) {
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
   assert(Reg && (Reg < TRI->getNumRegs()) &&
          "Trying to disable an invalid register");
 
-  initUpdatedCSRs();
+  if (!IsUpdatedCSRsInitialized) {
+    const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+    for (const MCPhysReg *I = CSR; *I; ++I)
+      UpdatedCSRs.push_back(*I);
+
+    // Zero value represents the end of the register list
+    // (no more registers should be pushed).
+    UpdatedCSRs.push_back(0);
 
-  // Remove the register (and its aliases) from the CSR list.
+    IsUpdatedCSRsInitialized = true;
+  }
+
+  // Remove the register (and its aliases from the list).
   for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
     UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
                       UpdatedCSRs.end());
 }
 
-void MachineRegisterInfo::enableCalleeSavedRegister(Register Reg) {
-  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  assert(Reg && (Reg < TRI->getNumRegs()) &&
-         "Trying to disable an invalid register");
-
-  initUpdatedCSRs();
-
-  // Remove the null terminator from the end of the list.
-  assert(UpdatedCSRs.back() == 0);
-  UpdatedCSRs.pop_back();
-
-  // Add the register (and its sub-registers) to the CSR list.
-  for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI)
-    UpdatedCSRs.push_back(*SRI);
-
-  // Put the null terminator back.
-  UpdatedCSRs.push_back(0);
-}
-
 const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
   if (IsUpdatedCSRsInitialized)
     return UpdatedCSRs.data();

diff  --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index b3cece0223b5..32e2deec353c 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -451,8 +451,6 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
         FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset);
       }
 
-      LLVM_DEBUG(dbgs() << "Assigned " << RegInfo->getName(Reg)
-                        << " to spill slot " << FrameIdx << "\n");
       CS.setFrameIdx(FrameIdx);
     }
   }

diff  --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index f9203f0e453e..76e516279487 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -71,14 +71,6 @@ static cl::opt<bool>
 SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
                      cl::desc("Align ARM NEON spills in prolog and epilog"));
 
-static cl::opt<bool> EnableExtraSpills(
-    "arm-extra-spills", cl::Hidden, cl::init(false),
-    cl::desc("Preserve extra registers when useful for IPRA"));
-
-// Testing option to bypass some profitability checks.
-static cl::opt<bool> ForceExtraSpills("arm-extra-spills-force", cl::Hidden,
-                                      cl::init(false));
-
 static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);
@@ -1625,251 +1617,6 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
   SavedRegs.set(ARM::R4);
 }
 
-// Compute the set of registers which cannot be preserved, because they are
-// either modified outside the PUSH/POP instructions, or are live at the point
-// where the POP will be inserted. This only considers r0-r3, which are
-// currently the only registers we voluntatrily save when the PCS doesn't
-// require it.
-void ARMFrameLowering::findRegDefsOutsideSaveRestore(
-    MachineFunction &MF, BitVector &UnsaveableRegs) const {
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  SmallSet<MachineBasicBlock *, 2> SaveBlocks;
-  SmallSet<MachineBasicBlock *, 2> RestoreBlocks;
-
-  if (MFI.getSavePoint()) {
-    SaveBlocks.insert(MFI.getSavePoint());
-    RestoreBlocks.insert(MFI.getRestorePoint());
-  } else {
-    SaveBlocks.insert(&MF.front());
-    for (MachineBasicBlock &MBB : MF)
-      if (MBB.isReturnBlock())
-        RestoreBlocks.insert(&MBB);
-  }
-
-  // Walk blocks from the function entry and exits (following control flow both
-  // ways), stopping when we get to a save/restore block. Check for
-  // instructions which modify any of the registers we care about.
-  SmallVector<MachineBasicBlock *, 4> WorkList;
-  SmallSet<MachineBasicBlock *, 4> VisitedBlocks;
-  LLVM_DEBUG(dbgs() << "Entry block: " << MF.front().getName() << "\n");
-  WorkList.push_back(&MF.front());
-  for (MachineBasicBlock &MBB : MF) {
-    if (MBB.isReturnBlock()) {
-      LLVM_DEBUG(dbgs() << "Return block: " << MBB.getName() << "\n");
-      WorkList.push_back(&MBB);
-    }
-  }
-
-  auto CheckOutsideInst = [&UnsaveableRegs, TRI](MachineInstr &MI) {
-    for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
-      if (MI.modifiesRegister(Reg, TRI)) {
-        UnsaveableRegs.set(Reg);
-        LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
-                          << " modified by instruction " << MI << "\n");
-      }
-    }
-  };
-
-  while (!WorkList.empty()) {
-    MachineBasicBlock *MBB = WorkList.pop_back_val();
-
-    if (VisitedBlocks.count(MBB))
-      continue;
-    VisitedBlocks.insert(MBB);
-
-    bool IsSave = SaveBlocks.count(MBB);
-    bool IsRestore = RestoreBlocks.count(MBB);
-
-    LLVM_DEBUG(dbgs() << "Visiting block " << MBB->getName() << ", IsSave="
-                      << IsSave << ", IsRestore=" << IsRestore << "\n");
-
-    // If this is a restore block, the POP instruction will be inserted just
-    // before the terminator, so we need to consider any terminator
-    // instructions to be outside the preserved region. We also need to check
-    // for registers which are live at the POP insertion point, because these
-    // can't be restored without changing their value.
-    if (IsRestore) {
-      LivePhysRegs LPR(*TRI);
-      LPR.addLiveOuts(*MBB);
-      for (auto &Term : reverse(MBB->terminators())) {
-        LPR.stepBackward(Term);
-        CheckOutsideInst(Term);
-      }
-
-      for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
-        if (LPR.contains(Reg)) {
-          UnsaveableRegs.set(Reg);
-          LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
-                            << " live-out of restore block " << MBB->getName()
-                            << "\n");
-        }
-      }
-    }
-
-    // If this block is completely outside the save/restore region, then any
-    // modified registers can't be preserved. A save block counts as being
-    // inside the saved region, with the possible exception of the last few
-    // instructions if it's also a restore block, handled above. We don't visit
-    // blocks which are completely inside the saved region and don't have any
-    // save/restore instructions, so don't need to check that here.
-    if (!IsSave && !IsRestore)
-      for (auto &MI : *MBB)
-        CheckOutsideInst(MI);
-
-    // Walk the control flow graph in both directions, except for blocks which
-    // are inside the PUSH/POP region.
-    if (IsSave || !IsRestore)
-      for (auto Pred : MBB->predecessors())
-        WorkList.push_back(Pred);
-    if (!IsSave || IsRestore)
-      for (auto Succ : MBB->successors())
-        WorkList.push_back(Succ);
-  }
-}
-
-bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
-  // Shrink wrapping is detrimental to code size because it prevents merging
-  // the CSR restore and function return into one POP instruction. It also
-  // conflicts with saving extra registers for IPRA, because it makes more
-  // registers live at the PUSH/POP.
-  if (MF.getFunction().hasMinSize())
-    return false;
-
-  return true;
-}
-
-// When doing inter-procedural register allocation, saving extra registers in
-// [r0,r3] will allow us to keep live values in them in any callers. The extra
-// saves and restores don't cost us any code-size if we are already emitting
-// PUSH and POP instructions.
-unsigned ARMFrameLowering::spillExtraRegsForIPRA(MachineFunction &MF,
-                                                 BitVector &SavedRegs,
-                                                 bool HasFPRegSaves) const {
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  LLVM_DEBUG(dbgs() << "Extra spills for " << MF.getName() << ": ");
-
-  if (!EnableExtraSpills) {
-    LLVM_DEBUG(dbgs() << "optimisation not enabled\n");
-    return 0;
-  }
-
-  // If IPRA is not enabled, nothing will be able to take advantage of the
-  // extra saved registers.
-  if (!MF.getTarget().Options.EnableIPRA) {
-    LLVM_DEBUG(dbgs() << "IPRA disabled\n");
-    return 0;
-  }
-
-  // These registers will take extra time to save and restore, and will often
-  // go unused, so only to this at -Oz.
-  if (!MF.getFunction().hasMinSize()) {
-    LLVM_DEBUG(dbgs() << "not minsize\n");
-    return 0;
-  }
-
-  // If we are not currently spilling any registers, we'd need to add an extra
-  // PUSH/POP pair, so this isn't worth it.
-  if (!SavedRegs.any()) {
-    LLVM_DEBUG(dbgs() << "no existing push/pop\n");
-    return 0;
-  }
-
-  // If we can't guarantee that this definition of the function is the one
-  // which will be picked by the linker, then IPRA can't make use of any extra
-  // saved registers.
-  if (!MF.getFunction().isDefinitionExact()) {
-    LLVM_DEBUG(dbgs() << "inexact definition\n");
-    return 0;
-  }
-
-  int NumVisibleCallers = 0;
-  for (const User *U : MF.getFunction().users()) {
-    if (const CallBase *Call = dyn_cast<CallBase>(U)) {
-      if (Call->getCalledOperand() == &MF.getFunction()) {
-        ++NumVisibleCallers;
-      }
-    }
-  }
-
-  // If we don't have any direct callers in the current translation unit,
-  // nothing will be able to take advantage of the extra saved registers.
-  if (NumVisibleCallers == 0 && !ForceExtraSpills) {
-    LLVM_DEBUG(dbgs() << "no visible callers\n");
-    return 0;
-  }
-
-  // If we need to emit unwind tables, these will be longer if we need to
-  // preserve r0-r3, so we need a lot of visible calls to make this worthwhile.
-  if (MF.getFunction().needsUnwindTableEntry() && NumVisibleCallers <= 8 &&
-      !ForceExtraSpills) {
-    LLVM_DEBUG(dbgs() << "needs unwind table\n");
-    return 0;
-  }
-
-  // Ok, we've decided we are going to try the optimisation.
-  LLVM_DEBUG(dbgs() << "enabled\n");
-
-  // Compute the registers which can't be preserved because they are either
-  // modified before the PUSH or after the POP, or are live at the point where
-  // the POP will be inserted.
-  BitVector NonPreserveableRegisters;
-  NonPreserveableRegisters.resize(TRI->getNumRegs());
-  findRegDefsOutsideSaveRestore(MF, NonPreserveableRegisters);
-
-  unsigned NumExtraRegs = 0;
-
-  // We'd also like to leave some registers free so that we can use them to
-  // fold a small SP update into the PUSH/POP. We can't know exactly what this
-  // optimisation can do, because stack layout isn't finalised, but we can make
-  // a good enough estimate.
-  unsigned StackSize = MFI.estimateStackSize(MF);
-
-  // If the stack space is large, we probably won't be able to fold the SP
-  // update into the push/pop, so we should use all the registers we want. If
-  // we have FP register saves, then the SP update will be folded into the
-  // VPUSH/VPOP instead, and we can use the GPRs freely.
-  if (StackSize > 16 || HasFPRegSaves)
-    StackSize = 0;
-
-  LLVM_DEBUG(dbgs() << "Estimated " << StackSize
-                    << " bytes of SP update being folded into push/pop\n");
-
-  for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
-    if (StackSize) {
-      StackSize -= 4;
-      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
-                        << ", wanted for SP update\n");
-      continue;
-    }
-
-    // If we don't modify the register anywhere in this function, IPRA will
-    // already know that it is preserved, and there's no point in saving it.
-    if (!MRI.isPhysRegModified(Reg)) {
-      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
-                        << ", not modified\n");
-      continue;
-    }
-
-    if (NonPreserveableRegisters[Reg]) {
-      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
-                        << ", modified outide save region\n");
-      continue;
-    }
-
-    LLVM_DEBUG(dbgs() << "also saving " << TRI->getName(Reg) << " for IPRA\n");
-    SavedRegs.set(Reg);
-    MRI.enableCalleeSavedRegister(Reg);
-    ++NumExtraRegs;
-  }
-
-  return NumExtraRegs;
-}
-
 void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             BitVector &SavedRegs,
                                             RegScavenger *RS) const {
@@ -2260,14 +2007,6 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                         << "\n");
     }
 
-    // When using IPRA, we might want to preserve some of r0-r3, to reduce
-    // register pressure in our callers.
-    unsigned ExtraIPRASpills =
-        spillExtraRegsForIPRA(MF, SavedRegs, NumFPRSpills != 0);
-    NumGPRSpills += ExtraIPRASpills;
-    if (ExtraIPRASpills)
-      CS1Spilled = true;
-
     // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to
     // restore LR in that case.
     bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall();

diff  --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
index e03e2d0e1cdb..f30f3895d972 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -56,10 +56,6 @@ class ARMFrameLowering : public TargetFrameLowering {
 
   void getCalleeSaves(const MachineFunction &MF,
                       BitVector &SavedRegs) const override;
-  void findRegDefsOutsideSaveRestore(MachineFunction &MF,
-                                     BitVector &Regs) const;
-  unsigned spillExtraRegsForIPRA(MachineFunction &MF, BitVector &SavedRegs,
-                                 bool HasFPRegSaves) const;
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS) const override;
 
@@ -67,8 +63,9 @@ class ARMFrameLowering : public TargetFrameLowering {
                                 MachineBasicBlock &MBB) const override;
 
   /// Returns true if the target will correctly handle shrink wrapping.
-  bool enableShrinkWrapping(const MachineFunction &MF) const override;
-
+  bool enableShrinkWrapping(const MachineFunction &MF) const override {
+    return true;
+  }
   bool isProfitableForNoCSROpt(const Function &F) const override {
     // The no-CSR optimisation is bad for code size on ARM, because we can save
     // many registers with a single PUSH/POP pair.

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e4d1caa0b1e0..21c486658c0f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2463,24 +2463,25 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const uint32_t *Mask;
-  const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
-  if (isThisReturn) {
-    // For 'this' returns, use the R0-preserving mask if applicable
-    Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
-    if (!Mask) {
-      // Set isThisReturn to false if the calling convention is not one that
-      // allows 'returned' to be modeled in this way, so LowerCallResult does
-      // not try to pass 'this' straight through
-      isThisReturn = false;
+  if (!isTailCall) {
+    const uint32_t *Mask;
+    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+    if (isThisReturn) {
+      // For 'this' returns, use the R0-preserving mask if applicable
+      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
+      if (!Mask) {
+        // Set isThisReturn to false if the calling convention is not one that
+        // allows 'returned' to be modeled in this way, so LowerCallResult does
+        // not try to pass 'this' straight through
+        isThisReturn = false;
+        Mask = ARI->getCallPreservedMask(MF, CallConv);
+      }
+    } else
       Mask = ARI->getCallPreservedMask(MF, CallConv);
-    }
-  } else {
-    Mask = ARI->getCallPreservedMask(MF, CallConv);
-  }
 
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);

diff  --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index df7a3fed6acd..c5ca64b0d78a 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -216,10 +216,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
         break;
       }
       LLVM_FALLTHROUGH;
-    case ARM::R0:
-    case ARM::R1:
-    case ARM::R2:
-    case ARM::R3:
     case ARM::R4:
     case ARM::R5:
     case ARM::R6:
@@ -852,8 +848,7 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters(
   if (!LoRegsToSave.none()) {
     MachineInstrBuilder MIB =
         BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
-    for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5,
-                         ARM::R6, ARM::R7, ARM::LR}) {
+    for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
       if (LoRegsToSave[Reg]) {
         bool isKill = !MRI.isLiveIn(Reg);
         if (isKill && !MRI.isReserved(Reg))
@@ -961,9 +956,6 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
       llvm_unreachable("callee-saved register of unexpected class");
     }
 
-    if (Reg == ARM::LR)
-      I.setRestored(false);
-
     // If this is a low register not used as the frame pointer, we may want to
     // use it for restoring the high registers.
     if ((ARM::tGPRRegClass.contains(Reg)) &&
@@ -988,9 +980,6 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
   static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3,
                                          ARM::R4, ARM::R5, ARM::R6, ARM::R7};
   static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11};
-  static const unsigned AllLoRegs[] = {ARM::R0, ARM::R1, ARM::R2,
-                                       ARM::R3, ARM::R4, ARM::R5,
-                                       ARM::R6, ARM::R7, ARM::LR};
 
   const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
   const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
@@ -1029,10 +1018,16 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
       BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
 
   bool NeedsPop = false;
-  for (unsigned Reg : AllLoRegs) {
-    if (!LoRegsToRestore[Reg])
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    CalleeSavedInfo &Info = CSI[i-1];
+    unsigned Reg = Info.getReg();
+
+    // High registers (excluding lr) have already been dealt with
+    if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
       continue;
+
     if (Reg == ARM::LR) {
+      Info.setRestored(false);
       if (!MBB.succ_empty() ||
           MI->getOpcode() == ARM::TCRETURNdi ||
           MI->getOpcode() == ARM::TCRETURNri)

diff  --git a/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
deleted file mode 100644
index ead0278cb362..000000000000
--- a/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; RUN: llc -mtriple armv7a--none-eabi   -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
-
-; Test the interaction between IPRA and C++ exception handling. Currently, IPRA
-; only marks registers as preserved on the non-exceptional return path, not in
-; the landing pad.
-
-declare dso_local i8* @__cxa_allocate_exception(i32) local_unnamed_addr
-declare dso_local void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-declare dso_local i32 @__gxx_personality_v0(...)
-declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
-declare dso_local i8* @__cxa_begin_catch(i8*) local_unnamed_addr
-declare dso_local void @__cxa_end_catch() local_unnamed_addr
-
- at g = dso_local local_unnamed_addr global i32 0, align 4
- at _ZTIi = external dso_local constant i8*
-
-define dso_local i32 @_Z11maybe_throwv() minsize {
-; This function might return normally, or might throw an exception. r0 is used
-; for a return value, we can preserve r1-r3 for IPRA.
-; CHECK:      .save   {r1, r2, r3, lr}
-; CHECK-NEXT: push    {r1, r2, r3, lr}
-; CHECK:      pop{{(..)?}}    {r1, r2, r3, pc}
-entry:
-  %0 = load i32, i32* @g, align 4
-  %tobool = icmp eq i32 %0, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %entry
-  %exception = tail call i8* @__cxa_allocate_exception(i32 4)
-  %1 = bitcast i8* %exception to i32*
-  store i32 42, i32* %1, align 8
-  tail call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
-  unreachable
-
-if.else:                                          ; preds = %entry
-  ret i32 1337
-}
-
-; Use inline assembly to force r0-r3 to be alive across a potentially throwing
-; call, using them on the non-exceptional return path. r0 is the return value,
-; so must be copied to another register. r1-r3 are voluntarily preserved by the
-; callee, so can be left in those registers.
-define dso_local i32 @_Z25test_non_exceptional_pathv() minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-; CHECK:      @APP
-; CHECK-NEXT: @ def r0-r3
-; CHECK-NEXT: @NO_APP
-; CHECK-NEXT: mov     [[SAVE_R0:r[0-9]+]], r0
-; CHECK-NEXT: .Ltmp{{.*}}
-; CHECK-NEXT: bl      _Z11maybe_throwv
-; CHECK:      mov     r0, [[SAVE_R0]]
-; CHECK-NEXT: @APP
-; CHECK-NEXT: @ use r0-r3
-; CHECK-NEXT: @NO_APP
-entry:
-  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"()
-  %call = invoke i32 @_Z11maybe_throwv()
-          to label %try.cont unwind label %lpad
-
-lpad:                                             ; preds = %entry
-  %1 = landingpad { i8*, i32 }
-          cleanup
-          catch i8* bitcast (i8** @_ZTIi to i8*)
-  %2 = extractvalue { i8*, i32 } %1, 1
-  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
-  %matches = icmp eq i32 %2, %3
-  br i1 %matches, label %catch, label %ehcleanup
-
-catch:                                            ; preds = %lpad
-  %4 = extractvalue { i8*, i32 } %1, 0
-  %5 = tail call i8* @__cxa_begin_catch(i8* %4)
-  %6 = bitcast i8* %5 to i32*
-  %7 = load i32, i32* %6, align 4
-  tail call void @__cxa_end_catch()
-  br label %cleanup
-
-try.cont:                                         ; preds = %entry
-  %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3
-  %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2
-  %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1
-  %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0
-  tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3)
-  br label %cleanup
-
-cleanup:                                          ; preds = %try.cont, %catch
-  %retval.0 = phi i32 [ 0, %try.cont ], [ %7, %catch ]
-  ret i32 %retval.0
-
-ehcleanup:                                        ; preds = %lpad
-  resume { i8*, i32 } %1
-}
-
-
-; Use inline assembly to force r0-r3 to be alive across a potentially throwing
-; call, using them after catching the exception. IPRA does not currently mark
-; voluntarily preserved registers as live into the landing pad block, so all
-; four registers must be copied elsewhere.
-define dso_local i32 @_Z21test_exceptional_pathv() local_unnamed_addr minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-; CHECK:      @APP
-; CHECK-NEXT: @ def r0-r3
-; CHECK-NEXT: @NO_APP
-; CHECK-DAG: mov [[SAVE_R0:r[0-9]+]], r0
-; CHECK-DAG: mov [[SAVE_R1:r[0-9]+]], r1
-; CHECK-DAG: mov [[SAVE_R2:r[0-9]+]], r2
-; CHECK-DAG: mov [[SAVE_R3:r[0-9]+]], r3
-; CHECK:      bl      _Z11maybe_throw
-
-; CHECK:      bl      __cxa_begin_catch
-; CHECK:      mov     r0, [[SAVE_R0]]
-; CHECK-NEXT: mov     r1, [[SAVE_R1]]
-; CHECK-NEXT: mov     r2, [[SAVE_R2]]
-; CHECK-NEXT: mov     r3, [[SAVE_R3]]
-; CHECK-NEXT: @APP
-; CHECK-NEXT: @ use r0-r3
-; CHECK-NEXT: @NO_APP
-entry:
-  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"()
-  %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0
-  %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1
-  %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2
-  %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3
-  %call = invoke i32 @_Z11maybe_throwv()
-          to label %cleanup unwind label %lpad
-
-lpad:                                             ; preds = %entry
-  %1 = landingpad { i8*, i32 }
-          cleanup
-          catch i8* bitcast (i8** @_ZTIi to i8*)
-  %2 = extractvalue { i8*, i32 } %1, 1
-  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
-  %matches = icmp eq i32 %2, %3
-  br i1 %matches, label %catch, label %ehcleanup
-
-catch:                                            ; preds = %lpad
-  %4 = extractvalue { i8*, i32 } %1, 0
-  %5 = tail call i8* @__cxa_begin_catch(i8* %4)
-  %6 = bitcast i8* %5 to i32*
-  %7 = load i32, i32* %6, align 4
-  tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3)
-  tail call void @__cxa_end_catch()
-  br label %cleanup
-
-cleanup:                                          ; preds = %entry, %catch
-  %retval.0 = phi i32 [ %7, %catch ], [ 0, %entry ]
-  ret i32 %retval.0
-
-ehcleanup:                                        ; preds = %lpad
-  resume { i8*, i32 } %1
-}

diff  --git a/llvm/test/CodeGen/ARM/ipra-extra-spills.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll
deleted file mode 100644
index ad9f30abdbbd..000000000000
--- a/llvm/test/CodeGen/ARM/ipra-extra-spills.ll
+++ /dev/null
@@ -1,406 +0,0 @@
-; RUN: llc -mtriple armv7a--none-eabi   -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
-; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB2
-; RUN: llc -mtriple thumbv6m--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1
-
-; This clobbers r0, and already needs a push/pop, so we also save and restore
-; r0. The push of r11 is to maintain stack alignment (though that isn't
-; technically needed in this example).
-define void @test_r0_r4() minsize nounwind {
-; CHECK-LABEL: test_r0_r4:
-; ARM: .save   {r0, r4, r11, lr}
-; ARM: push    {r0, r4, r11, lr}
-; ARM: pop     {r0, r4, r11, pc}
-; THUMB1: .save   {r0, r4, r7, lr}
-; THUMB1: push    {r0, r4, r7, lr}
-; THUMB1: pop     {r0, r4, r7, pc}
-; THUMB2: .save   {r0, r4, r7, lr}
-; THUMB2: push    {r0, r4, r7, lr}
-; THUMB2: pop     {r0, r4, r7, pc}
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; This clobbers r0-r3, and already needs a push/pop, so we also save and
-; restore all of them.
-define void @test_r0_r1_r2_r3_r4() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4:
-; CHECK: .save   {r0, r1, r2, r3, r4, lr}
-; CHECK: push    {r0, r1, r2, r3, r4, lr}
-; CHECK: pop     {r0, r1, r2, r3, r4, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
-  ret void
-}
-
-; Check that IPRA does make use of the extra saved registers.
-define void @test_ipra() nounwind {
-; CHECK-LABEL: test_ipra:
-; CHECK: ASM1: r0, r1, r2, r3
-; CHECK-NOT: r0
-; CHECK-NOT: r1
-; CHECK-NOT: r2
-; CHECK-NOT: r3
-; CHECK: bl      test_r0_r1_r2_r3_r4
-; CHECK-NOT: r0
-; CHECK-NOT: r1
-; CHECK-NOT: r2
-; CHECK-NOT: r3
-; CHECK: ASM2: r0, r1, r2, r3
-  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() 
-  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
-  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
-  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
-  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
-  call void @test_r0_r1_r2_r3_r4()
-  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
-  ret void
-}
-
-; This clobbers r0-r3, but doesn't otherwise need a push/pop, so we don't add
-; them.
-define void @test_r0_r1_r2_r3() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3:
-; CHECK-NOT: push
-; CHECK-NOT: pop
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
-  ret void
-}
-
-; This has no callers in this file, so we don't push any extra registers.
-define void @test_r0_r4_not_called() minsize nounwind {
-; CHECK-LABEL: test_r0_r4_not_called:
-; CHECK: .save   {r4, lr}
-; CHECK: push    {r4, lr}
-; CHECK: pop     {r4, pc}
-; CHECK-NOT: push
-; CHECK-NOT: pop
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; This function is only optsize, not minsize, so we don't add any extra saves.
-define void @test_r0_r4_not_minsize() optsize nounwind {
-; CHECK-LABEL: test_r0_r4_not_minsize:
-; CHECK: .save   {r4, lr}
-; CHECK: push    {r4, lr}
-; CHECK: pop     {r4, pc}
-; CHECK-NOT: push
-; CHECK-NOT: pop
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; This function is not an exact definition (the linker could pick an
-; alternative version of it), so we don't add any extra saves.
-define linkonce_odr void @test_r0_r4_not_exact() minsize nounwind {
-; CHECK-LABEL: test_r0_r4_not_exact:
-; CHECK: .save   {r4, lr}
-; CHECK: push    {r4, lr}
-; CHECK: pop     {r4, pc}
-; CHECK-NOT: push
-; CHECK-NOT: pop
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; This clobbers r0-r3, but returns a value in r0, so only r1-r3 are saved.
-define i32 @test_r0_r1_r2_r3_r4_return_1() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_1:
-; ARM: .save   {r1, r2, r3, r4, r11, lr}
-; ARM: push    {r1, r2, r3, r4, r11, lr}
-; ARM: pop     {r1, r2, r3, r4, r11, pc}
-; THUMB1: .save   {r1, r2, r3, r4, r7, lr}
-; THUMB1: push    {r1, r2, r3, r4, r7, lr}
-; THUMB1: pop     {r1, r2, r3, r4, r7, pc}
-; THUMB2: .save   {r1, r2, r3, r4, r7, lr}
-; THUMB2: push    {r1, r2, r3, r4, r7, lr}
-; THUMB2: pop     {r1, r2, r3, r4, r7, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
-  ret i32 42
-}
-
-; This clobbers r0-r3, but returns a value in r0 and r1, so only r2-r3 are
-; saved.
-define i64 @test_r0_r1_r2_r3_r4_return_2() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_2:
-; CHECK: .save   {r2, r3, r4, lr}
-; CHECK: push    {r2, r3, r4, lr}
-; CHECK: pop     {r2, r3, r4, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
-  ret i64 42
-}
-
-; This clobbers r0-r3, but returns a value in all of r0-r3, so none of them can
-; be saved.
-define i128 @test_r0_r1_r2_r3_r4_return_4() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_4:
-; CHECK: .save   {r4, lr}
-; CHECK: push    {r4, lr}
-; CHECK: pop     {r4, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
-  ret i128 42
-}
-
-; This clobbers r0-r3, and returns a value in s0, so all of r0-r3 are saved (we
-; previously only checked the number of return registers, ignoring their
-; class).
-define arm_aapcs_vfpcc float @test_r0_r1_r2_r3_r4_return_float() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_float:
-; ARM: .save   {r0, r1, r2, r3, r4, lr}
-; ARM: push    {r0, r1, r2, r3, r4, lr}
-; ARM: pop     {r0, r1, r2, r3, r4, pc}
-; THUMB1: .save   {r1, r2, r3, r4, r7, lr}
-; THUMB1: push    {r1, r2, r3, r4, r7, lr}
-; THUMB1: pop     {r1, r2, r3, r4, r7, pc}
-; THUMB2: .save   {r0, r1, r2, r3, r4, lr}
-; THUMB2: push    {r0, r1, r2, r3, r4, lr}
-; THUMB2: pop     {r0, r1, r2, r3, r4, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
-  ret float 42.0
-}
-
-; Saving of high registers in thumb1 is more complicated, because they need to
-; be copied down to low registers to use push/pop instructions. Luckily, the
-; extra registers we are preserving are low registers, which are handled by the
-; outer-most push/pop pair, so this doesn't interact badly.
-define void @test_save_high_regs() minsize nounwind {
-; CHECK-LABEL: test_save_high_regs:
-; ARM: .save   {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
-; ARM: push    {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
-; ARM: pop     {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc}
-; THUMB1:      .save   {r0, r1, r2, r3, r7, lr}
-; THUMB1-NEXT: push    {r0, r1, r2, r3, r7, lr}
-; THUMB1-NEXT: mov     lr, r11
-; THUMB1-NEXT: mov     r7, r10
-; THUMB1-NEXT: mov     r3, r9
-; THUMB1-NEXT: mov     r2, r8
-; THUMB1-NEXT: .save   {r8, r9, r10, r11}
-; THUMB1-NEXT: push    {r2, r3, r7, lr}
-; THUMB1:      pop     {r0, r1, r2, r3}
-; THUMB1-NEXT: mov     r8, r0
-; THUMB1-NEXT: mov     r9, r1
-; THUMB1-NEXT: mov     r10, r2
-; THUMB1-NEXT: mov     r11, r3
-; THUMB1-NEXT: pop     {r0, r1, r2, r3, r7, pc}
-; THUMB2: .save   {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
-; THUMB2: push.w  {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
-; THUMB2: pop.w   {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc}
-  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r8},~{r9},~{r10},~{r11}"()
-  ret void
-}
-
-; We can also use extra registers in the PUSH/POP instructions to move the SP
-; to make space for local variables. These registers aren't preserved, because
-; the space they are saved in is used for the local variable. We try to back
-; off the extra-CSRs optimisation to allow this to still happen. In this case,
-; there are 8 bytes of stack space needed, so we preserve two argument
-; registers and use the other two for the SP update.
-define void @test_r0_r1_r2_r3_r4_stack8() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8:
-; CHECK: .save   {r2, r3, r4, lr}
-; CHECK: push    {r0, r1, r2, r3, r4, lr}
-; CHECK: pop     {r0, r1, r2, r3, r4, pc}
-  %a = alloca [2 x i32], align 4
-  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([2 x i32]* %a, i32 42)
-  ret void
-}
-
-; Check that, when the above function is called, r0 and r1 (used for the SP
-; updates) are considered clobbered, and r2 and r3 are preserved.
-define void @test_r0_r1_r2_r3_r4_stack8_caller() nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8_caller:
-; CHECK:      ASM1: r0, r1, r2, r3
-; CHECK-NEXT: @NO_APP
-; CHECK-NEXT: mov     r4, r0
-; CHECK-NEXT: mov     r5, r1
-; CHECK-NEXT: bl      test_r0_r1_r2_r3_r4
-; CHECK-NEXT: mov     r0, r4
-; CHECK-NEXT: mov     r1, r5
-; CHECK-NEXT: @APP
-; CHECK-NEXT: ASM2: r0, r1, r2, r3
-  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() 
-  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
-  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
-  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
-  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
-  call void @test_r0_r1_r2_r3_r4_stack8()
-  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
-  ret void
-}
-
-; Like @test_r0_r1_r2_r3_r4_stack8, but 16 bytes of stack space are needed, so
-; all of r0-r3 are used for the SP update, and not preserved.
-define void @test_r0_r1_r2_r3_r4_stack16() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack16:
-; CHECK: .save   {r4, lr}
-; CHECK: push    {r0, r1, r2, r3, r4, lr}
-; CHECK: pop     {r0, r1, r2, r3, r4, pc}
-  %a = alloca [4 x i32], align 4
-  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([4 x i32]* %a, i32 42)
-  ret void
-}
-
-; If more than 16 bytes of stack space are needed, it's unlikely that the
-; SP-update folding optimisation will succeed, so we revert back to preserving
-; r0-r3 for use in our callers.
-define void @test_r0_r1_r2_r3_r4_stack24() minsize nounwind {
-; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack24:
-; CHECK: .save   {r0, r1, r2, r3, r4, lr}
-; CHECK: push    {r0, r1, r2, r3, r4, lr}
-; CHECK: pop     {r0, r1, r2, r3, r4, pc}
-  %a = alloca [6 x i32], align 4
-  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([6 x i32]* %a, i32 42)
-  ret void
-}
-
-define i32 @tail_callee(i32 %a, i32 %b) minsize nounwind {
-entry:
-  tail call void asm sideeffect "", "~{r2}"()
-  ret i32 %a
-}
-
-; The tail call happens outside the save/restore region, so prevents us from
-; preserving some registers. r0 and r1 are outgoing arguments to the tail-call,
-; so can't be preserved. r2 is modified inside the tail-called function, so
-; can't be preserved. r3 is known to be preserved by the callee, so can be
-; preserved. For Thumb1, we can't (efficiently) use a tail-call here, so r1-r3
-; are all preserved, with r0 being the return value.
-define i32 @test_tail_call() minsize nounwind {
-entry:
-; CHECK-LABEL: test_tail_call:
-; ARM: .save   {r3, lr}
-; ARM: push    {r3, lr}
-; ARM: pop     {r3, lr}
-; ARM: b       tail_callee
-; THUMB2: .save   {r3, lr}
-; THUMB2: push    {r3, lr}
-; THUMB2: pop.w   {r3, lr}
-; THUMB2: b       tail_callee
-; THUMB1: .save   {r1, r2, r3, lr}
-; THUMB1: push    {r1, r2, r3, lr}
-; THUMB1: bl      tail_callee
-; THUMB1: pop     {r1, r2, r3, pc}
-  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
-  %call = tail call i32 @tail_callee(i32 3, i32 4)
-  ret i32 %call
-}
-
-declare i32 @tail_callee_external(i32 %a, i32 %b)
-
-; If we tail-call an external function, it could clobber any of r0-r3.
-define i32 @test_tail_call_external() minsize nounwind {
-entry:
-; CHECK-LABEL: test_tail_call_external:
-; ARM: .save   {r11, lr}
-; ARM: push    {r11, lr}
-; ARM: pop     {r11, lr}
-; ARM: b       tail_callee_external
-; THUMB2: .save   {r7, lr}
-; THUMB2: push    {r7, lr}
-; THUMB2: pop.w   {r7, lr}
-; THUMB2: b       tail_callee_external
-; THUMB1: .save   {r1, r2, r3, lr}
-; THUMB1: push    {r1, r2, r3, lr}
-; THUMB1: bl      tail_callee_external
-; THUMB1: pop     {r1, r2, r3, pc}
-  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
-  %call = tail call i32 @tail_callee_external(i32 3, i32 4)
-  ret i32 %call
-}
-
-define linkonce_odr i32 @tail_callee_linkonce_odr(i32 %a, i32 %b) minsize nounwind {
-entry:
-  tail call void asm sideeffect "", "~{r2}"()
-  ret i32 %a
-}
-
-; If a tail-callee has an interposable linkage type (such as linkonce_odr), we
-; can't assume the linker will pick the definition we can see, so must assume
-; it clobbers all of r0-r3.
-define i32 @test_tail_call_linkonce_odr() minsize nounwind {
-entry:
-; CHECK-LABEL: test_tail_call_linkonce_odr:
-; ARM: .save   {r11, lr}
-; ARM: push    {r11, lr}
-; ARM: pop     {r11, lr}
-; ARM: b       tail_callee_linkonce_odr
-; THUMB2: .save   {r7, lr}
-; THUMB2: push    {r7, lr}
-; THUMB2: pop.w   {r7, lr}
-; THUMB2: b       tail_callee_linkonce_odr
-; THUMB1: .save   {r1, r2, r3, lr}
-; THUMB1: push    {r1, r2, r3, lr}
-; THUMB1: bl      tail_callee_linkonce_odr
-; THUMB1: pop     {r1, r2, r3, pc}
-  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
-  %call = tail call i32 @tail_callee_linkonce_odr(i32 3, i32 4)
-  ret i32 %call
-}
-
-; This function doesn't have the nounwind attribute, so unwind tables will be
-; emitted. Saving r0-r3 requires a longer unwind instruction sequence, which
-; results in an increase in total code size if there are few callers to make
-; use of the extra registers.
-define void @test_unwind_tables() minsize {
-; CHECK-LABEL: test_unwind_tables:
-; ARM: .save   {r4, lr}
-; ARM: push    {r4, lr}
-; ARM: pop     {r4, pc}
-; THUMB1: .save   {r4, lr}
-; THUMB1: push    {r4, lr}
-; THUMB1: pop     {r4, pc}
-; THUMB2: .save   {r4, lr}
-; THUMB2: push    {r4, lr}
-; THUMB2: pop     {r4, pc}
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; This requires an unwind table, but has many call sites, so overall we expect
-; the benefits to outweigh the size increase of the unwind table.
-define void @test_unwind_tables_many_calls() minsize {
-; CHECK-LABEL: test_unwind_tables_many_calls:
-; ARM: .save   {r0, r4, r11, lr}
-; ARM: push    {r0, r4, r11, lr}
-; ARM: pop     {r0, r4, r11, pc}
-; THUMB1: .save   {r0, r4, r7, lr}
-; THUMB1: push    {r0, r4, r7, lr}
-; THUMB1: pop     {r0, r4, r7, pc}
-; THUMB2: .save   {r0, r4, r7, lr}
-; THUMB2: push    {r0, r4, r7, lr}
-; THUMB2: pop     {r0, r4, r7, pc}
-  call void asm sideeffect "", "~{r0},~{r4}"()
-  ret void
-}
-
-; We don't do this optimisation if there are no callers in the same translation
-; unit (otherwise IPRA wouldn't be able to take advantage of the extra saved
-; registers), so most functions in this file are called here.
-define void @caller() {
-; CHECK-LABEL: caller:
-  call void @test_r0_r4()
-  call void @test_r0_r1_r2_r3_r4()
-  call void @test_r0_r1_r2_r3()
-  call void @test_r0_r4_not_minsize()
-  call void @test_r0_r4_not_exact()
-  %t1 = call i32 @test_r0_r1_r2_r3_r4_return_1()
-  %t2 = call i64 @test_r0_r1_r2_r3_r4_return_2()
-  %t3 = call i128 @test_r0_r1_r2_r3_r4_return_4()
-  %t4 = call float @test_r0_r1_r2_r3_r4_return_float()
-  call void @test_save_high_regs()
-  call void @test_r0_r1_r2_r3_r4_stack16()
-  call void @test_r0_r1_r2_r3_r4_stack24()
-  %t5 = call i32 @test_tail_call()
-  %t6 = call i32 @test_tail_call_external()
-  %t7 = call i32 @test_tail_call_linkonce_odr()
-  call void @test_unwind_tables()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  call void @test_unwind_tables_many_calls()
-  ret void
-}

diff  --git a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
index 63726ce18df1..146a2223c357 100644
--- a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
+++ b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
@@ -66,13 +66,16 @@ return:                                           ; preds = %entry, %if.end
 define void @f3(i32 %x) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    bne .LBB2_2
+; CHECK-NEXT:  @ %bb.1: @ %t
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    moveq r0, #0
-; CHECK-NEXT:    bleq fn
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    bl fn
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:  .LBB2_2: @ %f
+; CHECK-NEXT:    bx lr
 entry:
   %p = icmp eq i32 %x, 1
   br i1 %p, label %t, label %f