[llvm] 73cea83 - [IPRA][ARM] Spill extra registers at -Oz

Oliver Stannard via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 18 06:51:25 PDT 2020


Author: Oliver Stannard
Date: 2020-03-18T13:51:16Z
New Revision: 73cea83a6f5ab521edf3cccfc603534776d691ec

URL: https://github.com/llvm/llvm-project/commit/73cea83a6f5ab521edf3cccfc603534776d691ec
DIFF: https://github.com/llvm/llvm-project/commit/73cea83a6f5ab521edf3cccfc603534776d691ec.diff

LOG: [IPRA][ARM] Spill extra registers at -Oz

When optimising for code size at the expense of performance, it is often
worth saving and restoring some of r0-r3, if inter-procedural register
allocation (IPRA) will be able to take advantage of them. This doesn't
cost any extra code size if we already have a PUSH/POP pair, and
increases the number of available registers across any calls to the
function.
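
For example (abridged from the new ipra-extra-spills.ll test, ARM mode,
with the function body elided), a minsize function which clobbers r0 and
r4 now saves and restores r0 in the PUSH/POP pair it needed anyway, so
callers compiled with IPRA can keep a value live in r0 across the call:

  test_r0_r4:
        .save   {r0, r4, r11, lr}
        push    {r0, r4, r11, lr}
        @ ... body clobbering r0 and r4 ...
        pop     {r0, r4, r11, pc}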

We already have an optimisation which tries to fold the subtract/add of the
SP into the PUSH/POP by using extra registers, which somewhat conflicts
with this. I've made the new optimisation less aggressive in cases where
the existing one is likely to trigger, which gives better results than
either of these optimisations by themselves.
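
To illustrate the interaction (again abridged from the new test, the
test_r0_r1_r2_r3_r4_stack8 case): when 8 bytes of locals are needed, r0
and r1 are left free so the PUSH/POP can fold the SP update, and only r2
and r3 are preserved for IPRA:

  test_r0_r1_r2_r3_r4_stack8:
        .save   {r2, r3, r4, lr}
        push    {r0, r1, r2, r3, r4, lr}   @ r0/r1 just make stack space
        @ ...
        pop     {r0, r1, r2, r3, r4, pc}   @ r0/r1 reload the local slots,
                                           @ so remain clobbered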

Differential revision: https://reviews.llvm.org/D69936

Added: 
    llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
    llvm/test/CodeGen/ARM/ipra-extra-spills.ll

Modified: 
    llvm/include/llvm/CodeGen/MachineRegisterInfo.h
    llvm/lib/CodeGen/MachineRegisterInfo.cpp
    llvm/lib/CodeGen/PrologEpilogInserter.cpp
    llvm/lib/Target/ARM/ARMFrameLowering.cpp
    llvm/lib/Target/ARM/ARMFrameLowering.h
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
    llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index c5a90b0c46e3..064b6075c095 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -91,6 +91,8 @@ class MachineRegisterInfo {
   /// all registers that were disabled are removed from the list.
   SmallVector<MCPhysReg, 16> UpdatedCSRs;
 
+  void initUpdatedCSRs();
+
   /// RegAllocHints - This vector records register allocation hints for
   /// virtual registers. For each virtual register, it keeps a pair of hint
   /// type and hints vector making up the allocation hints. Only the first
@@ -231,12 +233,17 @@ class MachineRegisterInfo {
 
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
-  /// \see UpdatedCalleeSavedRegs.
-  void disableCalleeSavedRegister(unsigned Reg);
+  /// \see UpdatedCSRs.
+  void disableCalleeSavedRegister(Register Reg);
+
+  /// Enables the register in the list of CSRs.
+  /// I.e. the register will appear as part of the CSR mask.
+  /// \see UpdatedCSRs.
+  void enableCalleeSavedRegister(Register Reg);
 
   /// Returns list of callee saved registers.
   /// The function returns the updated CSR list (after taking into account
-  /// registers that are disabled from the CSR list).
+  /// registers that are enabled/disabled from the CSR list).
   const MCPhysReg *getCalleeSavedRegs() const;
 
   /// Sets the updated Callee Saved Registers list.

diff  --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 270ba125df00..12f1bafe1fb2 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -610,30 +610,54 @@ bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg) const {
   return false;
 }
 
-void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+void MachineRegisterInfo::initUpdatedCSRs() {
+  if (IsUpdatedCSRsInitialized)
+    return;
 
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  assert(Reg && (Reg < TRI->getNumRegs()) &&
-         "Trying to disable an invalid register");
+  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+  for (const MCPhysReg *I = CSR; *I; ++I)
+    UpdatedCSRs.push_back(*I);
 
-  if (!IsUpdatedCSRsInitialized) {
-    const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
-    for (const MCPhysReg *I = CSR; *I; ++I)
-      UpdatedCSRs.push_back(*I);
+  // Zero value represents the end of the register list
+  // (no more registers should be pushed).
+  UpdatedCSRs.push_back(0);
 
-    // Zero value represents the end of the register list
-    // (no more registers should be pushed).
-    UpdatedCSRs.push_back(0);
+  IsUpdatedCSRsInitialized = true;
+}
 
-    IsUpdatedCSRsInitialized = true;
-  }
+void MachineRegisterInfo::disableCalleeSavedRegister(Register Reg) {
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  assert(Reg && (Reg < TRI->getNumRegs()) &&
+         "Trying to disable an invalid register");
+
+  initUpdatedCSRs();
 
-  // Remove the register (and its aliases from the list).
+  // Remove the register (and its aliases) from the CSR list.
   for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
     UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
                       UpdatedCSRs.end());
 }
 
+void MachineRegisterInfo::enableCalleeSavedRegister(Register Reg) {
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  assert(Reg && (Reg < TRI->getNumRegs()) &&
+         "Trying to enable an invalid register");
+
+  initUpdatedCSRs();
+
+  // Remove the null terminator from the end of the list.
+  assert(UpdatedCSRs.back() == 0);
+  UpdatedCSRs.pop_back();
+
+  // Add the register (and its sub-registers) to the CSR list.
+  for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI)
+    UpdatedCSRs.push_back(*SRI);
+
+  // Put the null terminator back.
+  UpdatedCSRs.push_back(0);
+}
+
 const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
   if (IsUpdatedCSRsInitialized)
     return UpdatedCSRs.data();

diff  --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index d4c63bbdf46e..a2ca8bcf43ac 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -453,6 +453,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
         FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset);
       }
 
+      LLVM_DEBUG(dbgs() << "Assigned " << RegInfo->getName(Reg)
+                        << " to spill slot " << FrameIdx << "\n");
       CS.setFrameIdx(FrameIdx);
     }
   }

diff  --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 8910b66c235e..0822a1c06243 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -71,6 +71,14 @@ static cl::opt<bool>
 SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
                      cl::desc("Align ARM NEON spills in prolog and epilog"));
 
+static cl::opt<bool> EnableExtraSpills(
+    "arm-extra-spills", cl::Hidden, cl::init(false),
+    cl::desc("Preserve extra registers when useful for IPRA"));
+
+// Testing option to bypass some profitability checks.
+static cl::opt<bool> ForceExtraSpills("arm-extra-spills-force", cl::Hidden,
+                                      cl::init(false));
+
 static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);
@@ -1617,6 +1625,251 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
   SavedRegs.set(ARM::R4);
 }
 
+// Compute the set of registers which cannot be preserved, because they are
+// either modified outside the PUSH/POP instructions, or are live at the point
+// where the POP will be inserted. This only considers r0-r3, which are
+// currently the only registers we voluntarily save when the PCS doesn't
+// require it.
+void ARMFrameLowering::findRegDefsOutsideSaveRestore(
+    MachineFunction &MF, BitVector &UnsaveableRegs) const {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  SmallSet<MachineBasicBlock *, 2> SaveBlocks;
+  SmallSet<MachineBasicBlock *, 2> RestoreBlocks;
+
+  if (MFI.getSavePoint()) {
+    SaveBlocks.insert(MFI.getSavePoint());
+    RestoreBlocks.insert(MFI.getRestorePoint());
+  } else {
+    SaveBlocks.insert(&MF.front());
+    for (MachineBasicBlock &MBB : MF)
+      if (MBB.isReturnBlock())
+        RestoreBlocks.insert(&MBB);
+  }
+
+  // Walk blocks from the function entry and exits (following control flow both
+  // ways), stopping when we get to a save/restore block. Check for
+  // instructions which modify any of the registers we care about.
+  SmallVector<MachineBasicBlock *, 4> WorkList;
+  SmallSet<MachineBasicBlock *, 4> VisitedBlocks;
+  LLVM_DEBUG(dbgs() << "Entry block: " << MF.front().getName() << "\n");
+  WorkList.push_back(&MF.front());
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.isReturnBlock()) {
+      LLVM_DEBUG(dbgs() << "Return block: " << MBB.getName() << "\n");
+      WorkList.push_back(&MBB);
+    }
+  }
+
+  auto CheckOutsideInst = [&UnsaveableRegs, TRI](MachineInstr &MI) {
+    for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+      if (MI.modifiesRegister(Reg, TRI)) {
+        UnsaveableRegs.set(Reg);
+        LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
+                          << " modified by instruction " << MI << "\n");
+      }
+    }
+  };
+
+  while (!WorkList.empty()) {
+    MachineBasicBlock *MBB = WorkList.pop_back_val();
+
+    if (VisitedBlocks.count(MBB))
+      continue;
+    VisitedBlocks.insert(MBB);
+
+    bool IsSave = SaveBlocks.count(MBB);
+    bool IsRestore = RestoreBlocks.count(MBB);
+
+    LLVM_DEBUG(dbgs() << "Visiting block " << MBB->getName() << ", IsSave="
+                      << IsSave << ", IsRestore=" << IsRestore << "\n");
+
+    // If this is a restore block, the POP instruction will be inserted just
+    // before the terminator, so we need to consider any terminator
+    // instructions to be outside the preserved region. We also need to check
+    // for registers which are live at the POP insertion point, because these
+    // can't be restored without changing their value.
+    if (IsRestore) {
+      LivePhysRegs LPR(*TRI);
+      LPR.addLiveOuts(*MBB);
+      for (auto &Term : reverse(MBB->terminators())) {
+        LPR.stepBackward(Term);
+        CheckOutsideInst(Term);
+      }
+
+      for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+        if (LPR.contains(Reg)) {
+          UnsaveableRegs.set(Reg);
+          LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
+                            << " live-out of restore block " << MBB->getName()
+                            << "\n");
+        }
+      }
+    }
+
+    // If this block is completely outside the save/restore region, then any
+    // modified registers can't be preserved. A save block counts as being
+    // inside the saved region, with the possible exception of the last few
+    // instructions if it's also a restore block, handled above. We don't visit
+    // blocks which are completely inside the saved region and don't have any
+    // save/restore instructions, so don't need to check that here.
+    if (!IsSave && !IsRestore)
+      for (auto &MI : *MBB)
+        CheckOutsideInst(MI);
+
+    // Walk the control flow graph in both directions, except for blocks which
+    // are inside the PUSH/POP region.
+    if (IsSave || !IsRestore)
+      for (auto Pred : MBB->predecessors())
+        WorkList.push_back(Pred);
+    if (!IsSave || IsRestore)
+      for (auto Succ : MBB->successors())
+        WorkList.push_back(Succ);
+  }
+}
+
+bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  // Shrink wrapping is detrimental to code size because it prevents merging
+  // the CSR restore and function return into one POP instruction. It also
+  // conflicts with saving extra registers for IPRA, because it makes more
+  // registers live at the PUSH/POP.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  return true;
+}
+
+// When doing inter-procedural register allocation, saving extra registers in
+// [r0,r3] will allow us to keep live values in them in any callers. The extra
+// saves and restores don't cost us any code-size if we are already emitting
+// PUSH and POP instructions.
+unsigned ARMFrameLowering::spillExtraRegsForIPRA(MachineFunction &MF,
+                                                 BitVector &SavedRegs,
+                                                 bool HasFPRegSaves) const {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  LLVM_DEBUG(dbgs() << "Extra spills for " << MF.getName() << ": ");
+
+  if (!EnableExtraSpills) {
+    LLVM_DEBUG(dbgs() << "optimisation not enabled\n");
+    return 0;
+  }
+
+  // If IPRA is not enabled, nothing will be able to take advantage of the
+  // extra saved registers.
+  if (!MF.getTarget().Options.EnableIPRA) {
+    LLVM_DEBUG(dbgs() << "IPRA disabled\n");
+    return 0;
+  }
+
+  // These registers will take extra time to save and restore, and will often
+// go unused, so only do this at -Oz.
+  if (!MF.getFunction().hasMinSize()) {
+    LLVM_DEBUG(dbgs() << "not minsize\n");
+    return 0;
+  }
+
+  // If we are not currently spilling any registers, we'd need to add an extra
+  // PUSH/POP pair, so this isn't worth it.
+  if (!SavedRegs.any()) {
+    LLVM_DEBUG(dbgs() << "no existing push/pop\n");
+    return 0;
+  }
+
+  // If we can't guarantee that this definition of the function is the one
+  // which will be picked by the linker, then IPRA can't make use of any extra
+  // saved registers.
+  if (!MF.getFunction().isDefinitionExact()) {
+    LLVM_DEBUG(dbgs() << "inexact definition\n");
+    return 0;
+  }
+
+  int NumVisibleCallers = 0;
+  for (const User *U : MF.getFunction().users()) {
+    if (const CallBase *Call = dyn_cast<CallBase>(U)) {
+      if (Call->getCalledOperand() == &MF.getFunction()) {
+        ++NumVisibleCallers;
+      }
+    }
+  }
+
+  // If we don't have any direct callers in the current translation unit,
+  // nothing will be able to take advantage of the extra saved registers.
+  if (NumVisibleCallers == 0 && !ForceExtraSpills) {
+    LLVM_DEBUG(dbgs() << "no visible callers\n");
+    return 0;
+  }
+
+  // If we need to emit unwind tables, these will be longer if we need to
+  // preserve r0-r3, so we need a lot of visible calls to make this worthwhile.
+  if (MF.getFunction().needsUnwindTableEntry() && NumVisibleCallers <= 8 &&
+      !ForceExtraSpills) {
+    LLVM_DEBUG(dbgs() << "needs unwind table\n");
+    return 0;
+  }
+
+  // Ok, we've decided we are going to try the optimisation.
+  LLVM_DEBUG(dbgs() << "enabled\n");
+
+  // Compute the registers which can't be preserved because they are either
+  // modified before the PUSH or after the POP, or are live at the point where
+  // the POP will be inserted.
+  BitVector NonPreserveableRegisters;
+  NonPreserveableRegisters.resize(TRI->getNumRegs());
+  findRegDefsOutsideSaveRestore(MF, NonPreserveableRegisters);
+
+  unsigned NumExtraRegs = 0;
+
+  // We'd also like to leave some registers free so that we can use them to
+  // fold a small SP update into the PUSH/POP. We can't know exactly what this
+  // optimisation can do, because stack layout isn't finalised, but we can make
+  // a good enough estimate.
+  unsigned StackSize = MFI.estimateStackSize(MF);
+
+  // If the stack space is large, we probably won't be able to fold the SP
+  // update into the push/pop, so we should use all the registers we want. If
+  // we have FP register saves, then the SP update will be folded into the
+  // VPUSH/VPOP instead, and we can use the GPRs freely.
+  if (StackSize > 16 || HasFPRegSaves)
+    StackSize = 0;
+
+  LLVM_DEBUG(dbgs() << "Estimated " << StackSize
+                    << " bytes of SP update being folded into push/pop\n");
+
+  for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+    if (StackSize) {
+      StackSize -= 4;
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", wanted for SP update\n");
+      continue;
+    }
+
+    // If we don't modify the register anywhere in this function, IPRA will
+    // already know that it is preserved, and there's no point in saving it.
+    if (!MRI.isPhysRegModified(Reg)) {
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", not modified\n");
+      continue;
+    }
+
+    if (NonPreserveableRegisters[Reg]) {
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", modified outside save region\n");
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "also saving " << TRI->getName(Reg) << " for IPRA\n");
+    SavedRegs.set(Reg);
+    MRI.enableCalleeSavedRegister(Reg);
+    ++NumExtraRegs;
+  }
+
+  return NumExtraRegs;
+}
+
 void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             BitVector &SavedRegs,
                                             RegScavenger *RS) const {
@@ -2007,6 +2260,14 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                         << "\n");
     }
 
+    // When using IPRA, we might want to preserve some of r0-r3, to reduce
+    // register pressure in our callers.
+    unsigned ExtraIPRASpills =
+        spillExtraRegsForIPRA(MF, SavedRegs, NumFPRSpills != 0);
+    NumGPRSpills += ExtraIPRASpills;
+    if (ExtraIPRASpills)
+      CS1Spilled = true;
+
     // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to
     // restore LR in that case.
     bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall();

diff  --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
index f30f3895d972..e03e2d0e1cdb 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -56,6 +56,10 @@ class ARMFrameLowering : public TargetFrameLowering {
 
   void getCalleeSaves(const MachineFunction &MF,
                       BitVector &SavedRegs) const override;
+  void findRegDefsOutsideSaveRestore(MachineFunction &MF,
+                                     BitVector &Regs) const;
+  unsigned spillExtraRegsForIPRA(MachineFunction &MF, BitVector &SavedRegs,
+                                 bool HasFPRegSaves) const;
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS) const override;
 
@@ -63,9 +67,8 @@ class ARMFrameLowering : public TargetFrameLowering {
                                 MachineBasicBlock &MBB) const override;
 
   /// Returns true if the target will correctly handle shrink wrapping.
-  bool enableShrinkWrapping(const MachineFunction &MF) const override {
-    return true;
-  }
+  bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
   bool isProfitableForNoCSROpt(const Function &F) const override {
     // The no-CSR optimisation is bad for code size on ARM, because we can save
     // many registers with a single PUSH/POP pair.

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a393c8fd9ed8..45a88a686dab 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2458,26 +2458,25 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  if (!isTailCall) {
-    const uint32_t *Mask;
-    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
-    if (isThisReturn) {
-      // For 'this' returns, use the R0-preserving mask if applicable
-      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
-      if (!Mask) {
-        // Set isThisReturn to false if the calling convention is not one that
-        // allows 'returned' to be modeled in this way, so LowerCallResult does
-        // not try to pass 'this' straight through
-        isThisReturn = false;
-        Mask = ARI->getCallPreservedMask(MF, CallConv);
-      }
-    } else
+  const uint32_t *Mask;
+  const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+  if (isThisReturn) {
+    // For 'this' returns, use the R0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
+    if (!Mask) {
+      // Set isThisReturn to false if the calling convention is not one that
+      // allows 'returned' to be modeled in this way, so LowerCallResult does
+      // not try to pass 'this' straight through
+      isThisReturn = false;
       Mask = ARI->getCallPreservedMask(MF, CallConv);
-
-    assert(Mask && "Missing call preserved mask for calling convention");
-    Ops.push_back(DAG.getRegisterMask(Mask));
+    }
+  } else {
+    Mask = ARI->getCallPreservedMask(MF, CallConv);
   }
 
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 

diff  --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index c017ebb885da..0e024437667e 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -216,6 +216,10 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
         break;
       }
       LLVM_FALLTHROUGH;
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
     case ARM::R4:
     case ARM::R5:
     case ARM::R6:
@@ -848,7 +852,8 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters(
   if (!LoRegsToSave.none()) {
     MachineInstrBuilder MIB =
         BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
-    for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
+    for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5,
+                         ARM::R6, ARM::R7, ARM::LR}) {
       if (LoRegsToSave[Reg]) {
         bool isKill = !MRI.isLiveIn(Reg);
         if (isKill && !MRI.isReserved(Reg))
@@ -956,6 +961,9 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
       llvm_unreachable("callee-saved register of unexpected class");
     }
 
+    if (Reg == ARM::LR)
+      I.setRestored(false);
+
     // If this is a low register not used as the frame pointer, we may want to
     // use it for restoring the high registers.
     if ((ARM::tGPRRegClass.contains(Reg)) &&
@@ -980,6 +988,9 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
   static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3,
                                          ARM::R4, ARM::R5, ARM::R6, ARM::R7};
   static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11};
+  static const unsigned AllLoRegs[] = {ARM::R0, ARM::R1, ARM::R2,
+                                       ARM::R3, ARM::R4, ARM::R5,
+                                       ARM::R6, ARM::R7, ARM::LR};
 
   const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
   const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
@@ -1018,16 +1029,10 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
       BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
 
   bool NeedsPop = false;
-  for (unsigned i = CSI.size(); i != 0; --i) {
-    CalleeSavedInfo &Info = CSI[i-1];
-    unsigned Reg = Info.getReg();
-
-    // High registers (excluding lr) have already been dealt with
-    if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
+  for (unsigned Reg : AllLoRegs) {
+    if (!LoRegsToRestore[Reg])
       continue;
-
     if (Reg == ARM::LR) {
-      Info.setRestored(false);
       if (!MBB.succ_empty() ||
           MI->getOpcode() == ARM::TCRETURNdi ||
           MI->getOpcode() == ARM::TCRETURNri)

diff  --git a/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
new file mode 100644
index 000000000000..ead0278cb362
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
@@ -0,0 +1,149 @@
+; RUN: llc -mtriple armv7a--none-eabi   -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
+
+; Test the interaction between IPRA and C++ exception handling. Currently, IPRA
+; only marks registers as preserved on the non-exceptional return path, not in
+; the landing pad.
+
+declare dso_local i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+declare dso_local void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
+declare dso_local i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+declare dso_local i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+declare dso_local void @__cxa_end_catch() local_unnamed_addr
+
+@g = dso_local local_unnamed_addr global i32 0, align 4
+@_ZTIi = external dso_local constant i8*
+
+define dso_local i32 @_Z11maybe_throwv() minsize {
+; This function might return normally, or might throw an exception. r0 is used
+; for the return value, so we can preserve r1-r3 for IPRA.
+; CHECK:      .save   {r1, r2, r3, lr}
+; CHECK-NEXT: push    {r1, r2, r3, lr}
+; CHECK:      pop{{(..)?}}    {r1, r2, r3, pc}
+entry:
+  %0 = load i32, i32* @g, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %exception = tail call i8* @__cxa_allocate_exception(i32 4)
+  %1 = bitcast i8* %exception to i32*
+  store i32 42, i32* %1, align 8
+  tail call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+
+if.else:                                          ; preds = %entry
+  ret i32 1337
+}
+
+; Use inline assembly to force r0-r3 to be alive across a potentially throwing
+; call, using them on the non-exceptional return path. r0 is the return value,
+; so must be copied to another register. r1-r3 are voluntarily preserved by the
+; callee, so can be left in those registers.
+define dso_local i32 @_Z25test_non_exceptional_pathv() minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK:      @APP
+; CHECK-NEXT: @ def r0-r3
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: mov     [[SAVE_R0:r[0-9]+]], r0
+; CHECK-NEXT: .Ltmp{{.*}}
+; CHECK-NEXT: bl      _Z11maybe_throwv
+; CHECK:      mov     r0, [[SAVE_R0]]
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @ use r0-r3
+; CHECK-NEXT: @NO_APP
+entry:
+  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"()
+  %call = invoke i32 @_Z11maybe_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 }
+          cleanup
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %2 = extractvalue { i8*, i32 } %1, 1
+  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %2, %3
+  br i1 %matches, label %catch, label %ehcleanup
+
+catch:                                            ; preds = %lpad
+  %4 = extractvalue { i8*, i32 } %1, 0
+  %5 = tail call i8* @__cxa_begin_catch(i8* %4)
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  tail call void @__cxa_end_catch()
+  br label %cleanup
+
+try.cont:                                         ; preds = %entry
+  %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3
+  %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2
+  %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1
+  %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0
+  tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3)
+  br label %cleanup
+
+cleanup:                                          ; preds = %try.cont, %catch
+  %retval.0 = phi i32 [ 0, %try.cont ], [ %7, %catch ]
+  ret i32 %retval.0
+
+ehcleanup:                                        ; preds = %lpad
+  resume { i8*, i32 } %1
+}
+
+
+; Use inline assembly to force r0-r3 to be alive across a potentially throwing
+; call, using them after catching the exception. IPRA does not currently mark
+; voluntarily preserved registers as live into the landing pad block, so all
+; four registers must be copied elsewhere.
+define dso_local i32 @_Z21test_exceptional_pathv() local_unnamed_addr minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK:      @APP
+; CHECK-NEXT: @ def r0-r3
+; CHECK-NEXT: @NO_APP
+; CHECK-DAG: mov [[SAVE_R0:r[0-9]+]], r0
+; CHECK-DAG: mov [[SAVE_R1:r[0-9]+]], r1
+; CHECK-DAG: mov [[SAVE_R2:r[0-9]+]], r2
+; CHECK-DAG: mov [[SAVE_R3:r[0-9]+]], r3
+; CHECK:      bl      _Z11maybe_throw
+
+; CHECK:      bl      __cxa_begin_catch
+; CHECK:      mov     r0, [[SAVE_R0]]
+; CHECK-NEXT: mov     r1, [[SAVE_R1]]
+; CHECK-NEXT: mov     r2, [[SAVE_R2]]
+; CHECK-NEXT: mov     r3, [[SAVE_R3]]
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @ use r0-r3
+; CHECK-NEXT: @NO_APP
+entry:
+  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"()
+  %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3
+  %call = invoke i32 @_Z11maybe_throwv()
+          to label %cleanup unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 }
+          cleanup
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %2 = extractvalue { i8*, i32 } %1, 1
+  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %2, %3
+  br i1 %matches, label %catch, label %ehcleanup
+
+catch:                                            ; preds = %lpad
+  %4 = extractvalue { i8*, i32 } %1, 0
+  %5 = tail call i8* @__cxa_begin_catch(i8* %4)
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3)
+  tail call void @__cxa_end_catch()
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %catch
+  %retval.0 = phi i32 [ %7, %catch ], [ 0, %entry ]
+  ret i32 %retval.0
+
+ehcleanup:                                        ; preds = %lpad
+  resume { i8*, i32 } %1
+}

diff  --git a/llvm/test/CodeGen/ARM/ipra-extra-spills.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll
new file mode 100644
index 000000000000..ad9f30abdbbd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll
@@ -0,0 +1,406 @@
+; RUN: llc -mtriple armv7a--none-eabi   -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB2
+; RUN: llc -mtriple thumbv6m--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1
+
+; This clobbers r0, and already needs a push/pop, so we also save and restore
+; r0. The push of r11 is to maintain stack alignment (though that isn't
+; technically needed in this example).
+define void @test_r0_r4() minsize nounwind {
+; CHECK-LABEL: test_r0_r4:
+; ARM: .save   {r0, r4, r11, lr}
+; ARM: push    {r0, r4, r11, lr}
+; ARM: pop     {r0, r4, r11, pc}
+; THUMB1: .save   {r0, r4, r7, lr}
+; THUMB1: push    {r0, r4, r7, lr}
+; THUMB1: pop     {r0, r4, r7, pc}
+; THUMB2: .save   {r0, r4, r7, lr}
+; THUMB2: push    {r0, r4, r7, lr}
+; THUMB2: pop     {r0, r4, r7, pc}
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This clobbers r0-r3, and already needs a push/pop, so we also save and
+; restore all of them.
+define void @test_r0_r1_r2_r3_r4() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4:
+; CHECK: .save   {r0, r1, r2, r3, r4, lr}
+; CHECK: push    {r0, r1, r2, r3, r4, lr}
+; CHECK: pop     {r0, r1, r2, r3, r4, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret void
+}
+
+; Check that IPRA does make use of the extra saved registers.
+define void @test_ipra() nounwind {
+; CHECK-LABEL: test_ipra:
+; CHECK: ASM1: r0, r1, r2, r3
+; CHECK-NOT: r0
+; CHECK-NOT: r1
+; CHECK-NOT: r2
+; CHECK-NOT: r3
+; CHECK: bl      test_r0_r1_r2_r3_r4
+; CHECK-NOT: r0
+; CHECK-NOT: r1
+; CHECK-NOT: r2
+; CHECK-NOT: r3
+; CHECK: ASM2: r0, r1, r2, r3
+  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() 
+  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
+  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
+  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
+  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
+  call void @test_r0_r1_r2_r3_r4()
+  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
+  ret void
+}
+
+; This clobbers r0-r3, but doesn't otherwise need a push/pop, so we don't add
+; them.
+define void @test_r0_r1_r2_r3() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3:
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+  ret void
+}
+
+; This function isn't called in this file, so we don't push any extra registers.
+define void @test_r0_r4_not_called() minsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_called:
+; CHECK: .save   {r4, lr}
+; CHECK: push    {r4, lr}
+; CHECK: pop     {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This function is only optsize, not minsize, so we don't add any extra saves.
+define void @test_r0_r4_not_minsize() optsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_minsize:
+; CHECK: .save   {r4, lr}
+; CHECK: push    {r4, lr}
+; CHECK: pop     {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This function is not an exact definition (the linker could pick an
+; alternative version of it), so we don't add any extra saves.
+define linkonce_odr void @test_r0_r4_not_exact() minsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_exact:
+; CHECK: .save   {r4, lr}
+; CHECK: push    {r4, lr}
+; CHECK: pop     {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This clobbers r0-r3, but returns a value in r0, so only r1-r3 are saved.
+define i32 @test_r0_r1_r2_r3_r4_return_1() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_1:
+; ARM: .save   {r1, r2, r3, r4, r11, lr}
+; ARM: push    {r1, r2, r3, r4, r11, lr}
+; ARM: pop     {r1, r2, r3, r4, r11, pc}
+; THUMB1: .save   {r1, r2, r3, r4, r7, lr}
+; THUMB1: push    {r1, r2, r3, r4, r7, lr}
+; THUMB1: pop     {r1, r2, r3, r4, r7, pc}
+; THUMB2: .save   {r1, r2, r3, r4, r7, lr}
+; THUMB2: push    {r1, r2, r3, r4, r7, lr}
+; THUMB2: pop     {r1, r2, r3, r4, r7, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret i32 42
+}
+
+; This clobbers r0-r3, but returns a value in r0 and r1, so only r2-r3 are
+; saved.
+define i64 @test_r0_r1_r2_r3_r4_return_2() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_2:
+; CHECK: .save   {r2, r3, r4, lr}
+; CHECK: push    {r2, r3, r4, lr}
+; CHECK: pop     {r2, r3, r4, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret i64 42
+}
+
+; This clobbers r0-r3, but returns a value in all of r0-r3, so none of them can
+; be saved.
+define i128 @test_r0_r1_r2_r3_r4_return_4() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_4:
+; CHECK: .save   {r4, lr}
+; CHECK: push    {r4, lr}
+; CHECK: pop     {r4, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret i128 42
+}
+
+; This clobbers r0-r3, and returns a value in s0, so all of r0-r3 are saved (we
+; previously only checked the number of return registers, ignoring their
+; class).
+define arm_aapcs_vfpcc float @test_r0_r1_r2_r3_r4_return_float() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_float:
+; ARM: .save   {r0, r1, r2, r3, r4, lr}
+; ARM: push    {r0, r1, r2, r3, r4, lr}
+; ARM: pop     {r0, r1, r2, r3, r4, pc}
+; THUMB1: .save   {r1, r2, r3, r4, r7, lr}
+; THUMB1: push    {r1, r2, r3, r4, r7, lr}
+; THUMB1: pop     {r1, r2, r3, r4, r7, pc}
+; THUMB2: .save   {r0, r1, r2, r3, r4, lr}
+; THUMB2: push    {r0, r1, r2, r3, r4, lr}
+; THUMB2: pop     {r0, r1, r2, r3, r4, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret float 42.0
+}
+
+; Saving of high registers in thumb1 is more complicated, because they need to
+; be copied down to low registers to use push/pop instructions. Luckily, the
+; extra registers we are preserving are low registers, which are handled by the
+; outer-most push/pop pair, so this doesn't interact badly.
+define void @test_save_high_regs() minsize nounwind {
+; CHECK-LABEL: test_save_high_regs:
+; ARM: .save   {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
+; ARM: push    {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
+; ARM: pop     {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc}
+; THUMB1:      .save   {r0, r1, r2, r3, r7, lr}
+; THUMB1-NEXT: push    {r0, r1, r2, r3, r7, lr}
+; THUMB1-NEXT: mov     lr, r11
+; THUMB1-NEXT: mov     r7, r10
+; THUMB1-NEXT: mov     r3, r9
+; THUMB1-NEXT: mov     r2, r8
+; THUMB1-NEXT: .save   {r8, r9, r10, r11}
+; THUMB1-NEXT: push    {r2, r3, r7, lr}
+; THUMB1:      pop     {r0, r1, r2, r3}
+; THUMB1-NEXT: mov     r8, r0
+; THUMB1-NEXT: mov     r9, r1
+; THUMB1-NEXT: mov     r10, r2
+; THUMB1-NEXT: mov     r11, r3
+; THUMB1-NEXT: pop     {r0, r1, r2, r3, r7, pc}
+; THUMB2: .save   {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
+; THUMB2: push.w  {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr}
+; THUMB2: pop.w   {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r8},~{r9},~{r10},~{r11}"()
+  ret void
+}
+
+; We can also use extra registers in the PUSH/POP instructions to move the SP
+; to make space for local variables. These registers aren't preserved, because
+; the space they are saved in is used for the local variable. We try to back
+; off the extra-CSRs optimisation to allow this to still happen. In this case,
+; there are 8 bytes of stack space needed, so we preserve two argument
+; registers and use the other two for the SP update.
+define void @test_r0_r1_r2_r3_r4_stack8() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8:
+; CHECK: .save   {r2, r3, r4, lr}
+; CHECK: push    {r0, r1, r2, r3, r4, lr}
+; CHECK: pop     {r0, r1, r2, r3, r4, pc}
+  %a = alloca [2 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([2 x i32]* %a, i32 42)
+  ret void
+}
+
+; Check that, when the above function is called, r0 and r1 (used for the SP
+; updates) are considered clobbered, and r2 and r3 are preserved.
+define void @test_r0_r1_r2_r3_r4_stack8_caller() nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8_caller:
+; CHECK:      ASM1: r0, r1, r2, r3
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: mov     r4, r0
+; CHECK-NEXT: mov     r5, r1
+; CHECK-NEXT: bl      test_r0_r1_r2_r3_r4
+; CHECK-NEXT: mov     r0, r4
+; CHECK-NEXT: mov     r1, r5
+; CHECK-NEXT: @APP
+; CHECK-NEXT: ASM2: r0, r1, r2, r3
+  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() 
+  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
+  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
+  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
+  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
+  call void @test_r0_r1_r2_r3_r4_stack8()
+  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
+  ret void
+}
+
+; Like @test_r0_r1_r2_r3_r4_stack8, but 16 bytes of stack space are needed, so
+; all of r0-r3 are used for the SP update, and not preserved.
+define void @test_r0_r1_r2_r3_r4_stack16() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack16:
+; CHECK: .save   {r4, lr}
+; CHECK: push    {r0, r1, r2, r3, r4, lr}
+; CHECK: pop     {r0, r1, r2, r3, r4, pc}
+  %a = alloca [4 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([4 x i32]* %a, i32 42)
+  ret void
+}
+
+; If more than 16 bytes of stack space are needed, it's unlikely that the
+; SP-update folding optimisation will succeed, so we revert back to preserving
+; r0-r3 for use in our callers.
+define void @test_r0_r1_r2_r3_r4_stack24() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack24:
+; CHECK: .save   {r0, r1, r2, r3, r4, lr}
+; CHECK: push    {r0, r1, r2, r3, r4, lr}
+; CHECK: pop     {r0, r1, r2, r3, r4, pc}
+  %a = alloca [6 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([6 x i32]* %a, i32 42)
+  ret void
+}
+
+define i32 @tail_callee(i32 %a, i32 %b) minsize nounwind {
+entry:
+  tail call void asm sideeffect "", "~{r2}"()
+  ret i32 %a
+}
+
+; The tail call happens outside the save/restore region, so prevents us from
+; preserving some registers. r0 and r1 are outgoing arguments to the tail-call,
+; so can't be preserved. r2 is modified inside the tail-called function, so
+; can't be preserved. r3 is known to be preserved by the callee, so can be
+; preserved. For Thumb1, we can't (efficiently) use a tail-call here, so r1-r3
+; are all preserved, with r0 being the return value.
+define i32 @test_tail_call() minsize nounwind {
+entry:
+; CHECK-LABEL: test_tail_call:
+; ARM: .save   {r3, lr}
+; ARM: push    {r3, lr}
+; ARM: pop     {r3, lr}
+; ARM: b       tail_callee
+; THUMB2: .save   {r3, lr}
+; THUMB2: push    {r3, lr}
+; THUMB2: pop.w   {r3, lr}
+; THUMB2: b       tail_callee
+; THUMB1: .save   {r1, r2, r3, lr}
+; THUMB1: push    {r1, r2, r3, lr}
+; THUMB1: bl      tail_callee
+; THUMB1: pop     {r1, r2, r3, pc}
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
+  %call = tail call i32 @tail_callee(i32 3, i32 4)
+  ret i32 %call
+}
+
+declare i32 @tail_callee_external(i32 %a, i32 %b)
+
+; If we tail-call an external function, it could clobber any of r0-r3.
+define i32 @test_tail_call_external() minsize nounwind {
+entry:
+; CHECK-LABEL: test_tail_call_external:
+; ARM: .save   {r11, lr}
+; ARM: push    {r11, lr}
+; ARM: pop     {r11, lr}
+; ARM: b       tail_callee_external
+; THUMB2: .save   {r7, lr}
+; THUMB2: push    {r7, lr}
+; THUMB2: pop.w   {r7, lr}
+; THUMB2: b       tail_callee_external
+; THUMB1: .save   {r1, r2, r3, lr}
+; THUMB1: push    {r1, r2, r3, lr}
+; THUMB1: bl      tail_callee_external
+; THUMB1: pop     {r1, r2, r3, pc}
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
+  %call = tail call i32 @tail_callee_external(i32 3, i32 4)
+  ret i32 %call
+}
+
+define linkonce_odr i32 @tail_callee_linkonce_odr(i32 %a, i32 %b) minsize nounwind {
+entry:
+  tail call void asm sideeffect "", "~{r2}"()
+  ret i32 %a
+}
+
+; If a tail-callee has an interposable linkage type (such as linkonce_odr), we
+; can't assume the linker will pick the definition we can see, so must assume
+; it clobbers all of r0-r3.
+define i32 @test_tail_call_linkonce_odr() minsize nounwind {
+entry:
+; CHECK-LABEL: test_tail_call_linkonce_odr:
+; ARM: .save   {r11, lr}
+; ARM: push    {r11, lr}
+; ARM: pop     {r11, lr}
+; ARM: b       tail_callee_linkonce_odr
+; THUMB2: .save   {r7, lr}
+; THUMB2: push    {r7, lr}
+; THUMB2: pop.w   {r7, lr}
+; THUMB2: b       tail_callee_linkonce_odr
+; THUMB1: .save   {r1, r2, r3, lr}
+; THUMB1: push    {r1, r2, r3, lr}
+; THUMB1: bl      tail_callee_linkonce_odr
+; THUMB1: pop     {r1, r2, r3, pc}
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"()
+  %call = tail call i32 @tail_callee_linkonce_odr(i32 3, i32 4)
+  ret i32 %call
+}
+
+; This function doesn't have the nounwind attribute, so unwind tables will be
+; emitted. Saving r0-r3 requires a longer unwind instruction sequence, which
+; results in an increase in total code size if there are few callers to make
+; use of the extra registers.
+define void @test_unwind_tables() minsize {
+; CHECK-LABEL: test_unwind_tables:
+; ARM: .save   {r4, lr}
+; ARM: push    {r4, lr}
+; ARM: pop     {r4, pc}
+; THUMB1: .save   {r4, lr}
+; THUMB1: push    {r4, lr}
+; THUMB1: pop     {r4, pc}
+; THUMB2: .save   {r4, lr}
+; THUMB2: push    {r4, lr}
+; THUMB2: pop     {r4, pc}
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This requires an unwind table, but has many call sites, so overall we expect
+; the benefits to outweigh the size increase of the unwind table.
+define void @test_unwind_tables_many_calls() minsize {
+; CHECK-LABEL: test_unwind_tables_many_calls:
+; ARM: .save   {r0, r4, r11, lr}
+; ARM: push    {r0, r4, r11, lr}
+; ARM: pop     {r0, r4, r11, pc}
+; THUMB1: .save   {r0, r4, r7, lr}
+; THUMB1: push    {r0, r4, r7, lr}
+; THUMB1: pop     {r0, r4, r7, pc}
+; THUMB2: .save   {r0, r4, r7, lr}
+; THUMB2: push    {r0, r4, r7, lr}
+; THUMB2: pop     {r0, r4, r7, pc}
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; We don't do this optimisation if there are no callers in the same translation
+; unit (otherwise IPRA wouldn't be able to take advantage of the extra saved
+; registers), so most functions in this file are called here.
+define void @caller() {
+; CHECK-LABEL: caller:
+  call void @test_r0_r4()
+  call void @test_r0_r1_r2_r3_r4()
+  call void @test_r0_r1_r2_r3()
+  call void @test_r0_r4_not_minsize()
+  call void @test_r0_r4_not_exact()
+  %t1 = call i32 @test_r0_r1_r2_r3_r4_return_1()
+  %t2 = call i64 @test_r0_r1_r2_r3_r4_return_2()
+  %t3 = call i128 @test_r0_r1_r2_r3_r4_return_4()
+  %t4 = call float @test_r0_r1_r2_r3_r4_return_float()
+  call void @test_save_high_regs()
+  call void @test_r0_r1_r2_r3_r4_stack16()
+  call void @test_r0_r1_r2_r3_r4_stack24()
+  %t5 = call i32 @test_tail_call()
+  %t6 = call i32 @test_tail_call_external()
+  %t7 = call i32 @test_tail_call_linkonce_odr()
+  call void @test_unwind_tables()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
index 146a2223c357..63726ce18df1 100644
--- a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
+++ b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
@@ -66,16 +66,13 @@ return:                                           ; preds = %entry, %if.end
 define void @f3(i32 %x) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    bne .LBB2_2
-; CHECK-NEXT:  @ %bb.1: @ %t
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    bl fn
-; CHECK-NEXT:    pop.w {r7, lr}
-; CHECK-NEXT:  .LBB2_2: @ %f
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r0, #0
+; CHECK-NEXT:    bleq fn
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %p = icmp eq i32 %x, 1
   br i1 %p, label %t, label %f


        

