[llvm] r241920 - ARMLoadStoreOptimizer: Rewrite LDM/STM matching logic.

Matthias Braun matze at braunis.de
Fri Jul 10 11:08:49 PDT 2015


Author: matze
Date: Fri Jul 10 13:08:49 2015
New Revision: 241920

URL: http://llvm.org/viewvc/llvm-project?rev=241920&view=rev
Log:
ARMLoadStoreOptimizer: Rewrite LDM/STM matching logic.

This improves the logic in several ways and is preparation for
follow-up patches:
- First perform an analysis and create a list of merge candidates, then
  transform. This simplifies the code because you no longer have to worry
  about holding iterators to MachineInstrs that get removed.
- Analyze/Transform basic blocks in reverse order. This allows using
  LivePhysRegs to find free registers instead of the RegisterScavenger
  (see the sketch below); the RegisterScavenger will become less precise
  in the future because it relies on the deprecated kill flags.
- Return the newly created instruction from MergeOps so there's no need to
  look around in the schedule to find it.
- Rename some MBBI iterators to InsertBefore to make their role clear.
- General code cleanup.

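For illustration, here is a minimal standalone sketch of the backwards
liveness idea from the second bullet above. It is plain C++ with made-up
types (Inst, LiveSet and AllocationOrder are illustrative stand-ins for
MachineInstr, LivePhysRegs and RegisterClassInfo::getOrder()); it is not
the pass's code, only a model of how a backward sweep answers "which
register is free just before this point", which is what the new
moveLiveRegsBefore()/findFreeReg() pair does instead of querying the
RegisterScavenger.

  #include <cstddef>
  #include <set>
  #include <vector>

  struct Inst {
    std::vector<unsigned> Defs; // registers written by this instruction
    std::vector<unsigned> Uses; // registers read by this instruction
  };

  using LiveSet = std::set<unsigned>;

  // Transfer function for one instruction when scanning backwards: a register
  // is live before the instruction if it is read here, or if it was live
  // after the instruction and is not redefined here.
  void stepBackward(LiveSet &Live, const Inst &I) {
    for (unsigned R : I.Defs)
      Live.erase(R);
    for (unsigned R : I.Uses)
      Live.insert(R);
  }

  // Return the first register of the allocation order that is not live, or 0.
  unsigned findFreeReg(const LiveSet &Live,
                       const std::vector<unsigned> &AllocationOrder) {
    for (unsigned R : AllocationOrder)
      if (!Live.count(R))
        return R;
    return 0;
  }

  // Answer "which register is free just before position Pos?" by sweeping
  // backwards from the block's live-outs. The real pass caches the live set
  // between queries and only steps it further back, which is why the merge
  // candidates are sorted and processed from the end of the block towards
  // the beginning.
  unsigned scratchRegBefore(const std::vector<Inst> &Block, std::size_t Pos,
                            LiveSet LiveOuts,
                            const std::vector<unsigned> &AllocationOrder) {
    for (std::size_t I = Block.size(); I > Pos; --I)
      stepBackward(LiveOuts, Block[I - 1]);
    return findFreeReg(LiveOuts, AllocationOrder);
  }

In the patch itself this shows up as MergeOps() calling
moveLiveRegsBefore(MBB, InsertBefore) and then
findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass) when it has
to materialize a new base register for a store.
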
Differential Revision: http://reviews.llvm.org/D10140

Modified:
    llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
    llvm/trunk/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll

Modified: llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp?rev=241920&r1=241919&r2=241920&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp Fri Jul 10 13:08:49 2015
@@ -31,11 +31,13 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -65,12 +67,18 @@ namespace {
     static char ID;
     ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
 
+    const MachineFunction *MF;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
     const ARMSubtarget *STI;
     const TargetLowering *TL;
     ARMFunctionInfo *AFI;
-    RegScavenger *RS;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
     bool isThumb1, isThumb2;
 
     bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -80,64 +88,51 @@ namespace {
     }
 
   private:
+    /// A set of load/store MachineInstrs with same base register sorted by
+    /// offset.
     struct MemOpQueueEntry {
-      int Offset;
-      unsigned Reg;
-      bool isKill;
-      unsigned Position;
-      MachineBasicBlock::iterator MBBI;
-      bool Merged;
-      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
-                      MachineBasicBlock::iterator i)
-        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Position; ///< Position as counted from end of basic block.
+      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
+        : MI(MI), Offset(Offset), Position(Position) {}
     };
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
-    typedef MemOpQueue::iterator MemOpQueueIter;
 
-    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
-                          const MemOpQueue &MemOps, unsigned DefReg,
-                          unsigned RangeBegin, unsigned RangeEnd);
+    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+    /// merged into a LDM/STM.
+    struct MergeCandidate {
+      /// List of instructions ordered by load/store offset.
+      SmallVector<MachineInstr*, 4> Instrs;
+      /// Index in Instrs of the instruction being latest in the schedule.
+      unsigned LatestMIIdx;
+      /// Index in Instrs of the instruction being earliest in the schedule.
+      unsigned EarliestMIIdx;
+      /// Index into the basic block where the merged instruction will be
+      /// inserted. (See MemOpQueueEntry.Position)
+      unsigned InsertPos;
+    };
+    BumpPtrAllocator Allocator;
+    SmallVector<const MergeCandidate*,4> Candidates;
+
+    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                            MachineBasicBlock::const_iterator Before);
+    unsigned findFreeReg(const TargetRegisterClass &RegClass);
     void UpdateBaseRegUses(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI,
-                           DebugLoc dl, unsigned Base, unsigned WordOffset,
+                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                            ARMCC::CondCodes Pred, unsigned PredReg);
-    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                  int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
-                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
-                  DebugLoc dl,
-                  ArrayRef<std::pair<unsigned, bool> > Regs,
-                  ArrayRef<unsigned> ImpDefs);
-    void MergeOpsUpdate(MachineBasicBlock &MBB,
-                        MemOpQueue &MemOps,
-                        unsigned memOpsBegin,
-                        unsigned memOpsEnd,
-                        unsigned insertAfter,
-                        int Offset,
-                        unsigned Base,
-                        bool BaseKill,
-                        unsigned Opcode,
-                        ARMCC::CondCodes Pred,
-                        unsigned PredReg,
-                        unsigned Scratch,
-                        DebugLoc dl,
-                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
-                      unsigned Opcode, unsigned Size,
-                      ARMCC::CondCodes Pred, unsigned PredReg,
-                      unsigned Scratch, MemOpQueue &MemOps,
-                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    MachineInstr *MergeOps(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore, int Offset,
+                           unsigned Base, bool BaseKill, unsigned Opcode,
+                           ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL,
+                           ArrayRef<std::pair<unsigned, bool>> Regs);
+    void FormCandidates(const MemOpQueue &MemOps);
+    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI);
-    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator MBBI,
-                                  const TargetInstrInfo *TII,
-                                  bool &Advance,
-                                  MachineBasicBlock::iterator &I);
-    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   bool &Advance,
-                                   MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
   };
@@ -185,6 +180,14 @@ static int getMemoryOpOffset(const Machi
   return Offset;
 }
 
+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+  return MI.getOperand(1);
+}
+
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+  return MI.getOperand(0);
+}
+
 static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
   switch (Opcode) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -348,6 +351,10 @@ static bool isi32Store(unsigned Opc) {
   return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
 }
 
+static bool isLoadSingle(unsigned Opc) {
+  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+}
+
 static unsigned getImmScale(unsigned Opc) {
   switch (Opc) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -365,12 +372,55 @@ static unsigned getImmScale(unsigned Opc
   }
 }
 
+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
 /// Update future uses of the base register with the offset introduced
 /// due to writeback. This function only works on Thumb1.
 void
 ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
-                                   DebugLoc dl, unsigned Base,
+                                   DebugLoc DL, unsigned Base,
                                    unsigned WordOffset,
                                    ARMCC::CondCodes Pred, unsigned PredReg) {
   assert(isThumb1 && "Can only update base register uses for Thumb1!");
@@ -398,7 +448,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(Machi
         Offset = MO.getImm() - WordOffset * getImmScale(Opc);
 
         // If storing the base register, it needs to be reset first.
-        unsigned InstrSrcReg = MBBI->getOperand(0).getReg();
+        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
 
         if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
           MO.setImm(Offset);
@@ -439,7 +489,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(Machi
 
     if (InsertSub) {
       // An instruction above couldn't be updated, so insert a sub.
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
         .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
       return;
     }
@@ -457,31 +507,67 @@ ARMLoadStoreOpt::UpdateBaseRegUses(Machi
     // See PR21029.
     if (MBBI != MBB.end()) --MBBI;
     AddDefaultT1CC(
-      BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
       .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
   }
 }
 
+/// Return the first register of class \p RegClass that is not in \p Regs.
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+  if (!RegClassInfoValid) {
+    RegClassInfo.runOnMachineFunction(*MF);
+    RegClassInfoValid = true;
+  }
+
+  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+    if (!LiveRegs.contains(Reg))
+      return Reg;
+  return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal schedule
+/// direction). Computes backwards so multiple queries in the same block must
+/// come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+    MachineBasicBlock::const_iterator Before) {
+  // Initialize if we never queried in this block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveOuts(&MBB, true);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+  // Move backward just before the "Before" position.
+  while (LiveRegPos != Before) {
+    --LiveRegPos;
+    LiveRegs.stepBackward(*LiveRegPos);
+  }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (const std::pair<unsigned, bool> &R : Regs)
+    if (R.first == Reg)
+      return true;
+  return false;
+}
+
 /// Create and insert a LDM or STM with Base as base register and registers in
 /// Regs as the register operands that would be loaded / stored.  It returns
 /// true if the transformation is done.
-bool
+MachineInstr *
 ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          int Offset, unsigned Base, bool BaseKill,
-                          unsigned Opcode, ARMCC::CondCodes Pred,
-                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
-                          ArrayRef<std::pair<unsigned, bool> > Regs,
-                          ArrayRef<unsigned> ImpDefs) {
-  // Only a single register to load / store. Don't bother.
+                          MachineBasicBlock::iterator InsertBefore, int Offset,
+                          unsigned Base, bool BaseKill, unsigned Opcode,
+                          ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL,
+                          ArrayRef<std::pair<unsigned, bool>> Regs) {
   unsigned NumRegs = Regs.size();
-  if (NumRegs <= 1)
-    return false;
+  assert(NumRegs > 1);
 
   // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
   // Compute liveness information for that register to make the decision.
   bool SafeToClobberCPSR = !isThumb1 ||
-    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) ==
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
      MachineBasicBlock::LQR_Dead);
 
   bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
@@ -489,17 +575,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
   // Exception: If the base register is in the input reglist, Thumb1 LDM is
   // non-writeback.
   // It's also not possible to merge an STR of the base register in Thumb1.
-  if (isThumb1)
-    for (const std::pair<unsigned, bool> &R : Regs)
-      if (Base == R.first) {
-        assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
-        if (Opcode == ARM::tLDRi) {
-          Writeback = false;
-          break;
-        } else if (Opcode == ARM::tSTRi) {
-          return false;
-        }
-      }
+  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+    if (Opcode == ARM::tLDRi) {
+      Writeback = false;
+    } else if (Opcode == ARM::tSTRi) {
+      return nullptr;
+    }
+  }
 
   ARM_AM::AMSubMode Mode = ARM_AM::ia;
   // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
@@ -516,18 +599,18 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
   } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
     // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
-    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
 
     // If starting offset isn't zero, insert a MI to materialize a new base.
     // But only do so if it is cost effective, i.e. merging more than two
     // loads / stores.
     if (NumRegs <= 2)
-      return false;
+      return nullptr;
 
     // On Thumb1, it's not worth materializing a new base register without
     // clobbering the CPSR (i.e. not using ADDS/SUBS).
     if (!SafeToClobberCPSR)
-      return false;
+      return nullptr;
 
     unsigned NewBase;
     if (isi32Load(Opcode)) {
@@ -535,10 +618,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
       // use as the new base.
       NewBase = Regs[NumRegs-1].first;
     } else {
-      // Use the scratch register to use as a new base.
-      NewBase = Scratch;
+      // Find a free register that we can use as scratch register.
+      moveLiveRegsBefore(MBB, InsertBefore);
+      // The merged instruction does not exist yet but will use several Regs if
+      // it is a Store.
+      if (!isLoadSingle(Opcode))
+        for (const std::pair<unsigned, bool> &R : Regs)
+          LiveRegs.addReg(R.first);
+
+      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
       if (NewBase == 0)
-        return false;
+        return nullptr;
     }
 
     int BaseOpc =
@@ -557,7 +647,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
 
     if (!TL->isLegalAddImmediate(Offset))
       // FIXME: Try add with register operand?
-      return false; // Probably not worth it then.
+      return nullptr; // Probably not worth it then.
+
+    // We can only append a kill flag to the add/sub input if the value is not
+    // used in the register list of the stm as well.
+    bool KillOldBase = BaseKill &&
+      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));
 
     if (isThumb1) {
       // Thumb1: depending on immediate size, use either
@@ -572,43 +667,44 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
             !STI->hasV6Ops()) {
           // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
           if (Pred != ARMCC::AL)
-            return false;
-          BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
-            .addReg(Base, getKillRegState(BaseKill));
+            return nullptr;
+          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+            .addReg(Base, getKillRegState(KillOldBase));
         } else
-          BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
-            .addReg(Base, getKillRegState(BaseKill))
+          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+            .addReg(Base, getKillRegState(KillOldBase))
             .addImm(Pred).addReg(PredReg);
 
-        // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
+        // The following ADDS/SUBS becomes an update.
         Base = NewBase;
-        BaseKill = false;
+        KillOldBase = true;
       }
       if (BaseOpc == ARM::tADDrSPi) {
         assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
-        BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-          .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
           .addImm(Pred).addReg(PredReg);
       } else
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
-          .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+        AddDefaultT1CC(
+          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
           .addImm(Pred).addReg(PredReg);
     } else {
-      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
         .addImm(Pred).addReg(PredReg).addReg(0);
     }
     Base = NewBase;
     BaseKill = true; // New base is always killed straight away.
   }
 
-  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
-                Opcode == ARM::VLDRD);
+  bool isDef = isLoadSingle(Opcode);
 
   // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
   // base register writeback.
   Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
-  if (!Opcode) return false;
+  if (!Opcode)
+    return nullptr;
 
   // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
   // - There is no writeback (LDM of base register),
@@ -619,7 +715,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
   // It's safe to return here since the code to materialize a new base register
   // above is also conditional on SafeToClobberCPSR.
   if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
-    return false;
+    return nullptr;
 
   MachineInstrBuilder MIB;
 
@@ -628,7 +724,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
       // Update tLDMIA with writeback if necessary.
       Opcode = ARM::tLDMIA_UPD;
 
-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
 
     // Thumb1: we might need to set base writeback when building the MI.
     MIB.addReg(Base, getDefRegState(true))
@@ -637,214 +733,144 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
     // The base isn't dead after a merged instruction with writeback.
     // Insert a sub instruction after the newly formed instruction to reset.
     if (!BaseKill)
-      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
 
   } else {
     // No writeback, simply build the MachineInstr.
-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
     MIB.addReg(Base, getKillRegState(BaseKill));
   }
 
   MIB.addImm(Pred).addReg(PredReg);
 
   for (const std::pair<unsigned, bool> &R : Regs)
-    MIB = MIB.addReg(R.first, getDefRegState(isDef)
-                     | getKillRegState(R.second));
-
-  // Add implicit defs for super-registers.
-  for (unsigned ImpDef : ImpDefs)
-    MIB.addReg(ImpDef, RegState::ImplicitDefine);
+    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
 
-  return true;
-}
-
-/// Find all instructions using a given imp-def within a range.
-///
-/// We are trying to combine a range of instructions, one of which (located at
-/// position RangeBegin) implicitly defines a register. The final LDM/STM will
-/// be placed at RangeEnd, and so any uses of this definition between RangeStart
-/// and RangeEnd must be modified to use an undefined value.
-///
-/// The live range continues until we find a second definition or one of the
-/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
-/// we must consider all uses and decide which are relevant in a second pass.
-void ARMLoadStoreOpt::findUsesOfImpDef(
-    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
-    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
-  std::map<unsigned, MachineOperand *> Uses;
-  unsigned LastLivePos = RangeEnd;
-
-  // First we find all uses of this register with Position between RangeBegin
-  // and RangeEnd, any or all of these could be uses of a definition at
-  // RangeBegin. We also record the latest position a definition at RangeBegin
-  // would be considered live.
-  for (unsigned i = 0; i < MemOps.size(); ++i) {
-    MachineInstr &MI = *MemOps[i].MBBI;
-    unsigned MIPosition = MemOps[i].Position;
-    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
-      continue;
-
-    // If this instruction defines the register, then any later use will be of
-    // that definition rather than ours.
-    if (MI.definesRegister(DefReg))
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
-    if (!UseOp)
-      continue;
-
-    // If this instruction kills the register then (assuming liveness is
-    // correct when we start) we don't need to think about anything after here.
-    if (UseOp->isKill())
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    Uses[MIPosition] = UseOp;
-  }
-
-  // Now we traverse the list of all uses, and append the ones that actually use
-  // our definition to the requested list.
-  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
-                                                      E = Uses.end();
-       I != E; ++I) {
-    // List is sorted by position so once we've found one out of range there
-    // will be no more to consider.
-    if (I->first > LastLivePos)
-      break;
-    UsesOfImpDefs.push_back(I->second);
-  }
+  return MIB.getInstr();
 }
 
 /// Call MergeOps and update MemOps and merges accordingly on success.
-void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
-                                     MemOpQueue &memOps,
-                                     unsigned memOpsBegin, unsigned memOpsEnd,
-                                     unsigned insertAfter, int Offset,
-                                     unsigned Base, bool BaseKill,
-                                     unsigned Opcode,
-                                     ARMCC::CondCodes Pred, unsigned PredReg,
-                                     unsigned Scratch,
-                                     DebugLoc dl,
-                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
-  // First calculate which of the registers should be killed by the merged
-  // instruction.
-  const unsigned insertPos = memOps[insertAfter].Position;
-  SmallSet<unsigned, 4> KilledRegs;
-  DenseMap<unsigned, unsigned> Killer;
-  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
-    if (i == memOpsBegin) {
-      i = memOpsEnd;
-      if (i == e)
-        break;
-    }
-    if (memOps[i].Position < insertPos && memOps[i].isKill) {
-      unsigned Reg = memOps[i].Reg;
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    unsigned Reg = MO.getReg();
+    bool IsKill = MO.isKill();
+    if (IsKill)
       KilledRegs.insert(Reg);
-      Killer[Reg] = i;
+    Regs.push_back(std::make_pair(Reg, IsKill));
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers, after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // removed these implicit operands.
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef() || MO.isDead())
+          continue;
+        assert(MO.isImplicit());
+        unsigned DefReg = MO.getReg();
+
+        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
+          continue;
+        // We can ignore cases where the super-reg is read and written.
+        if (MI->readsRegister(DefReg))
+          continue;
+        ImpDefs.push_back(DefReg);
+      }
     }
   }
 
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    MachineOperand &TransferOp = memOps[i].MBBI->getOperand(0);
-    if (TransferOp.isUse() && TransferOp.getReg() == Base)
-      BaseKill = false;
+  // Attempt the merge.
+  typedef MachineBasicBlock::iterator iterator;
+  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+  iterator InsertBefore = std::next(iterator(LatestMI));
+  MachineBasicBlock &MBB = *LatestMI->getParent();
+  unsigned Offset = getMemoryOpOffset(First);
+  unsigned Base = getLoadStoreBaseOp(*First).getReg();
+  bool BaseKill = LatestMI->killsRegister(Base);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+  DebugLoc DL = First->getDebugLoc();
+  MachineInstr *Merged = MergeOps(MBB, InsertBefore, Offset, Base, BaseKill,
+                                  Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged)
+    return nullptr;
+
+  // Determine earliest instruction that will get removed. We then keep an
+  // iterator just above it so the following erases don't invalidated it.
+  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+  bool EarliestAtBegin = false;
+  if (EarliestI == MBB.begin()) {
+    EarliestAtBegin = true;
+  } else {
+    EarliestI = std::prev(EarliestI);
   }
 
-  SmallVector<std::pair<unsigned, bool>, 8> Regs;
-  SmallVector<unsigned, 8> ImpDefs;
-  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    unsigned Reg = memOps[i].Reg;
-    // If we are inserting the merged operation after an operation that
-    // uses the same register, make sure to transfer any kill flag.
-    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
-    Regs.push_back(std::make_pair(Reg, isKill));
-
-    // Collect any implicit defs of super-registers. They must be preserved.
-    for (const MachineOperand &MO : memOps[i].MBBI->operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead())
-        continue;
-      unsigned DefReg = MO.getReg();
-      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
-        ImpDefs.push_back(DefReg);
-
-      // There may be other uses of the definition between this instruction and
-      // the eventual LDM/STM position. These should be marked undef if the
-      // merge takes place.
-      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
-                       insertPos);
+  // Remove instructions which have been merged.
+  for (MachineInstr *MI : Cand.Instrs)
+    MBB.erase(MI);
+
+  // Determine range between the earliest removed instruction and the new one.
+  if (EarliestAtBegin)
+    EarliestI = MBB.begin();
+  else
+    EarliestI = std::next(EarliestI);
+  auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+  if (isLoadSingle(Opcode)) {
+    // If the previous loads defined a super-reg, then we have to mark earlier
+    // operands undef; Replicate the super-reg def on the merged instruction.
+    for (MachineInstr &MI : FixupRange) {
+      for (unsigned &ImpDefReg : ImpDefs) {
+        for (MachineOperand &MO : MI.implicit_operands()) {
+          if (!MO.isReg() || MO.getReg() != ImpDefReg)
+            continue;
+          if (MO.readsReg())
+            MO.setIsUndef();
+          else if (MO.isDef())
+            ImpDefReg = 0;
+        }
+      }
     }
-  }
 
-  // Try to do the merge.
-  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
-  ++Loc;
-  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
-                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
-    return;
-
-  // Merge succeeded, update records.
-  Merges.push_back(std::prev(Loc));
-
-  // In gathering loads together, we may have moved the imp-def of a register
-  // past one of its uses. This is OK, since we know better than the rest of
-  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
-  // affected uses.
-  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
-                                                   E = UsesOfImpDefs.end();
-                                                   I != E; ++I)
-    (*I)->setIsUndef();
-
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    // Remove kill flags from any memops that come before insertPos.
-    if (Regs[i-memOpsBegin].second) {
-      unsigned Reg = Regs[i-memOpsBegin].first;
-      if (KilledRegs.count(Reg)) {
-        unsigned j = Killer[Reg];
-        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
-        assert(Idx >= 0 && "Cannot find killing operand");
-        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
-        memOps[j].isKill = false;
+    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+    for (unsigned ImpDef : ImpDefs)
+      MIB.addReg(ImpDef, RegState::ImplicitDefine);
+  } else {
+    // Remove kill flags: We are possibly storing the values later now.
+    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+    for (MachineInstr &MI : FixupRange) {
+      for (MachineOperand &MO : MI.uses()) {
+        if (!MO.isReg() || !MO.isKill())
+          continue;
+        if (KilledRegs.count(MO.getReg()))
+          MO.setIsKill(false);
       }
-      memOps[i].isKill = true;
     }
-    MBB.erase(memOps[i].MBBI);
-    // Update this memop to refer to the merged instruction.
-    // We may need to move kill flags again.
-    memOps[i].Merged = true;
-    memOps[i].MBBI = Merges.back();
-    memOps[i].Position = insertPos;
+    assert(ImpDefs.empty());
   }
 
-  // Update memOps offsets, since they may have been modified by MergeOps.
-  for (auto &MemOp : memOps) {
-    MemOp.Offset = getMemoryOpOffset(MemOp.MBBI);
-  }
+  return Merged;
 }
 
-/// Merge a number of load / store instructions into one or more load / store
-/// multiple instructions.
-void
-ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
-                         unsigned Base, unsigned Opcode, unsigned Size,
-                         ARMCC::CondCodes Pred, unsigned PredReg,
-                         unsigned Scratch, MemOpQueue &MemOps,
-                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+  const MachineInstr *FirstMI = MemOps[0].MI;
+  unsigned Opcode = FirstMI->getOpcode();
   bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
-  int Offset = MemOps[SIndex].Offset;
-  int SOffset = Offset;
-  unsigned insertAfter = SIndex;
-  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
-  DebugLoc dl = Loc->getDebugLoc();
-  const MachineOperand &PMO = Loc->getOperand(0);
-  unsigned PReg = PMO.getReg();
-  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
-  unsigned Count = 1;
-  unsigned Limit = ~0U;
-  bool BaseKill = false;
+  unsigned Size = getLSMultipleTransferSize(FirstMI);
   // vldm / vstm limit are 32 for S variants, 16 for D variants.
-
+  unsigned Limit;
   switch (Opcode) {
-  default: break;
+  default:
+    Limit = UINT_MAX;
+    break;
   case ARM::VSTRS:
     Limit = 32;
     break;
@@ -859,46 +885,64 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBas
     break;
   }
 
-  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
-    int NewOffset = MemOps[i].Offset;
-    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
-    unsigned Reg = MO.getReg();
-    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
-    // Register numbers must be in ascending order. For VFP / NEON load and
-    // store multiples, the registers must also be consecutive and within the
-    // limit on the number of registers per instruction.
-    if (Reg != ARM::SP &&
-        NewOffset == Offset + (int)Size &&
-        ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1)) &&
-        // On Swift we don't want vldm/vstm to start with a odd register num
-        // because Q register unaligned vldm/vstm need more uops.
-        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+  unsigned SIndex = 0;
+  unsigned EIndex = MemOps.size();
+  do {
+    // Look at the first instruction.
+    const MachineInstr *MI = MemOps[SIndex].MI;
+    int Offset = MemOps[SIndex].Offset;
+    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+    unsigned PReg = PMO.getReg();
+    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+    unsigned Latest = SIndex;
+    unsigned Earliest = SIndex;
+    unsigned Count = 1;
+
+    // Merge additional instructions fulfilling LDM/STM constraints.
+    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+      int NewOffset = MemOps[I].Offset;
+      if (NewOffset != Offset + (int)Size)
+        break;
+      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+      unsigned Reg = MO.getReg();
+      if (Reg == ARM::SP)
+        break;
+      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+      // Register numbers must be in ascending order.
+      if (RegNum <= PRegNum)
+        break;
+      // For VFP / NEON load/store multiples, the registers must be consecutive
+      // and within the limit on the number of registers per instruction.
+      if (!isNotVFP && RegNum != PRegNum+1)
+        break;
+      // On Swift we don't want vldm/vstm to start with a odd register num
+      // because Q register unaligned vldm/vstm need more uops.
+      if (!isNotVFP && STI->isSwift() && Count == 1 && (PRegNum % 2) == 1)
+        break;
+
+      // Track MemOp with latest and earliest position (Positions are
+      // counted in reverse).
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
       Offset += Size;
       PRegNum = RegNum;
-      ++Count;
-    } else {
-      // Can't merge this in. Try merge the earlier ones first.
-      // We need to compute BaseKill here because the MemOps may have been
-      // reordered.
-      BaseKill = Loc->killsRegister(Base);
-
-      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
-                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
-                   MemOps, Merges);
-      return;
     }
 
-    if (MemOps[i].Position > MemOps[insertAfter].Position) {
-      insertAfter = i;
-      Loc = MemOps[i].MBBI;
-    }
-  }
-
-  BaseKill =  Loc->killsRegister(Base);
-  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
-                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
+    // Form a candidate from the Ops collected so far.
+    MergeCandidate *Candidate = new(Allocator) MergeCandidate;
+    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+      Candidate->Instrs.push_back(MemOps[C].MI);
+    Candidate->LatestMIIdx = Latest - SIndex;
+    Candidate->EarliestMIIdx = Earliest - SIndex;
+    Candidate->InsertPos = MemOps[Latest].Position;
+    Candidates.push_back(Candidate);
+    // Continue after the chain.
+    SIndex += Count;
+  } while (SIndex < EIndex);
 }
 
 static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
@@ -971,49 +1015,6 @@ static bool isMatchingIncrement(MachineI
   return CheckCPSRDef ? !definesCPSR(MI) : true;
 }
 
-static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
-  switch (MI->getOpcode()) {
-  default: return 0;
-  case ARM::LDRi12:
-  case ARM::STRi12:
-  case ARM::tLDRi:
-  case ARM::tSTRi:
-  case ARM::tLDRspi:
-  case ARM::tSTRspi:
-  case ARM::t2LDRi8:
-  case ARM::t2LDRi12:
-  case ARM::t2STRi8:
-  case ARM::t2STRi12:
-  case ARM::VLDRS:
-  case ARM::VSTRS:
-    return 4;
-  case ARM::VLDRD:
-  case ARM::VSTRD:
-    return 8;
-  case ARM::LDMIA:
-  case ARM::LDMDA:
-  case ARM::LDMDB:
-  case ARM::LDMIB:
-  case ARM::STMIA:
-  case ARM::STMDA:
-  case ARM::STMDB:
-  case ARM::STMIB:
-  case ARM::tLDMIA:
-  case ARM::tLDMIA_UPD:
-  case ARM::tSTMIA_UPD:
-  case ARM::t2LDMIA:
-  case ARM::t2LDMDB:
-  case ARM::t2STMIA:
-  case ARM::t2STMDB:
-  case ARM::VLDMSIA:
-  case ARM::VSTMSIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
-  case ARM::VLDMDIA:
-  case ARM::VSTMDIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
-  }
-}
-
 static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                             ARM_AM::AMSubMode Mode) {
   switch (Opc) {
@@ -1093,21 +1094,18 @@ static unsigned getUpdatingLSMultipleOpc
 /// ldmia rn, <ra, rb, rc>
 /// =>
 /// ldmdb rn!, <ra, rb, rc>
-bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
   // Thumb1 is already using updating loads/stores.
   if (isThumb1) return false;
 
-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(0).getReg();
-  bool BaseKill = MI->getOperand(0).isKill();
+  const MachineOperand &BaseOP = MI->getOperand(0);
+  unsigned Base = BaseOP.getReg();
+  bool BaseKill = BaseOP.isKill();
   unsigned Bytes = getLSMultipleTransferSize(MI);
   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
 
   // Can't use an updating ld/st if the base register is also a dest
   // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
@@ -1119,7 +1117,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSM
   ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
 
   // Try merging with the previous instruction.
+  MachineBasicBlock &MBB = *MI->getParent();
   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator MBBI(MI);
   if (MBBI != BeginMBBI) {
     MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
     while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
@@ -1150,20 +1150,15 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSM
                isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
       DoMerge = true;
     }
-    if (DoMerge) {
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
+    if (DoMerge)
       MBB.erase(NextMBBI);
-    }
   }
 
   if (!DoMerge)
     return false;
 
   unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
     .addReg(Base, getDefRegState(true)) // WB base register
     .addReg(Base, getKillRegState(BaseKill))
     .addImm(Pred).addReg(PredReg);
@@ -1231,21 +1226,16 @@ static unsigned getPostIndexedLoadStoreO
 
 /// Fold proceeding/trailing inc/dec of base register into the
 /// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               const TargetInstrInfo *TII,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
   // Thumb1 doesn't have updating LDR/STR.
   // FIXME: Use LDM/STM with single register instead.
   if (isThumb1) return false;
 
-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(1).getReg();
-  bool BaseKill = MI->getOperand(1).isKill();
+  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
   unsigned Bytes = getLSMultipleTransferSize(MI);
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
   bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
@@ -1255,7 +1245,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
   if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
     return false;
 
-  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
+  bool isLd = isLoadSingle(Opcode);
   // Can't do the merge if the destination register is the same as the would-be
   // writeback register.
   if (MI->getOperand(0).getReg() == Base)
@@ -1270,7 +1260,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
   unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
 
   // Try merging with the previous instruction.
+  MachineBasicBlock &MBB = *MI->getParent();
   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator MBBI(MI);
   if (MBBI != BeginMBBI) {
     MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
     while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
@@ -1303,10 +1295,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
     }
     if (DoMerge) {
       NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
       MBB.erase(NextMBBI);
     }
   }
@@ -1320,7 +1308,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
     // updating load/store-multiple instructions can be used with only one
     // register.)
     MachineOperand &MO = MI->getOperand(0);
-    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
       .addReg(Base, getDefRegState(true)) // WB base register
       .addReg(Base, getKillRegState(isLd ? BaseKill : false))
       .addImm(Pred).addReg(PredReg)
@@ -1331,19 +1319,19 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
       // LDR_PRE, LDR_POST
       if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
         int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
-        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
           .addReg(Base, RegState::Define)
           .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
       } else {
         int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
-        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
           .addReg(Base, RegState::Define)
           .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
       }
     } else {
       int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
       // t2LDR_PRE, t2LDR_POST
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
         .addReg(Base, RegState::Define)
         .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
     }
@@ -1355,13 +1343,13 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoa
     if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
       int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
       // STR_PRE, STR_POST
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
         .addReg(MO.getReg(), getKillRegState(MO.isKill()))
         .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
     } else {
       int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
       // t2STR_PRE, t2STR_POST
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
         .addReg(MO.getReg(), getKillRegState(MO.isKill()))
         .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
     }
@@ -1426,26 +1414,10 @@ static bool isMemoryOp(const MachineInst
   return false;
 }
 
-/// Advance register scavenger to just before the earliest memory op that is
-/// being merged.
-void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
-  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
-  unsigned Position = MemOps[0].Position;
-  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
-    if (MemOps[i].Position < Position) {
-      Position = MemOps[i].Position;
-      Loc = MemOps[i].MBBI;
-    }
-  }
-
-  if (Loc != MBB.begin())
-    RS->forward(std::prev(Loc));
-}
-
 static void InsertLDR_STR(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI,
                           int Offset, bool isDef,
-                          DebugLoc dl, unsigned NewOpc,
+                          DebugLoc DL, unsigned NewOpc,
                           unsigned Reg, bool RegDeadKill, bool RegUndef,
                           unsigned BaseReg, bool BaseKill, bool BaseUndef,
                           bool OffKill, bool OffUndef,
@@ -1491,7 +1463,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairO
   if (!Errata602117 && !NonConsecutiveRegs)
     return false;
 
-  MachineBasicBlock::iterator NewBBI = MBBI;
   bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
   bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
   bool EvenDeadKill = isLd ?
@@ -1531,7 +1502,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairO
                 getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
       ++NumSTRD2STM;
     }
-    NewBBI = std::prev(MBBI);
   } else {
     // Split into two instructions.
     unsigned NewOpc = (isLd)
@@ -1553,7 +1523,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairO
                     OddReg, OddDeadKill, false,
                     BaseReg, false, BaseUndef, false, OffUndef,
                     Pred, PredReg, TII, isT2);
-      NewBBI = std::prev(MBBI);
       InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                     EvenReg, EvenDeadKill, false,
                     BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1573,7 +1542,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairO
                     EvenReg, EvenDeadKill, EvenUndef,
                     BaseReg, false, BaseUndef, false, OffUndef,
                     Pred, PredReg, TII, isT2);
-      NewBBI = std::prev(MBBI);
       InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                     OddReg, OddDeadKill, OddUndef,
                     BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1585,16 +1553,13 @@ bool ARMLoadStoreOpt::FixInvalidRegPairO
       ++NumSTRD2STR;
   }
 
-  MBB.erase(MI);
-  MBBI = NewBBI;
+  MBBI = MBB.erase(MBBI);
   return true;
 }
 
 /// An optimization pass to turn multiple LDR / STR ops of the same base and
 /// incrementing offset into LDM / STM ops.
 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
-  unsigned NumMerges = 0;
-  unsigned NumMemOps = 0;
   MemOpQueue MemOps;
   unsigned CurrBase = 0;
   unsigned CurrOpc = ~0u;
@@ -1602,174 +1567,137 @@ bool ARMLoadStoreOpt::LoadStoreMultipleO
   ARMCC::CondCodes CurrPred = ARMCC::AL;
   unsigned CurrPredReg = 0;
   unsigned Position = 0;
-  SmallVector<MachineBasicBlock::iterator,4> Merges;
+  assert(Candidates.size() == 0);
+  LiveRegsValid = false;
 
-  RS->enterBasicBlock(&MBB);
-  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-  while (MBBI != E) {
+  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
+       I = MBBI) {
+    // The instruction in front of the iterator is the one we look at.
+    MBBI = std::prev(I);
     if (FixInvalidRegPairOp(MBB, MBBI))
       continue;
+    ++Position;
 
-    bool Advance  = false;
-    bool TryMerge = false;
-
-    bool isMemOp = isMemoryOp(MBBI);
-    if (isMemOp) {
+    if (isMemoryOp(MBBI)) {
       unsigned Opcode = MBBI->getOpcode();
       unsigned Size = getLSMultipleTransferSize(MBBI);
       const MachineOperand &MO = MBBI->getOperand(0);
       unsigned Reg = MO.getReg();
-      bool isKill = MO.isDef() ? false : MO.isKill();
-      unsigned Base = MBBI->getOperand(1).getReg();
+      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
       unsigned PredReg = 0;
       ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
       int Offset = getMemoryOpOffset(MBBI);
-      // Watch out for:
-      // r4 := ldr [r5]
-      // r5 := ldr [r5, #4]
-      // r6 := ldr [r5, #8]
-      //
-      // The second ldr has effectively broken the chain even though it
-      // looks like the later ldr(s) use the same base register. Try to
-      // merge the ldr's so far, including this one. But don't try to
-      // combine the following ldr(s).
-      bool Clobber = isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg();
-
-      // Watch out for:
-      // r4 := ldr [r0, #8]
-      // r4 := ldr [r0, #4]
-      //
-      // The optimization may reorder the second ldr in front of the first
-      // ldr, which violates write after write(WAW) dependence. The same as
-      // str. Try to merge inst(s) already in MemOps.
-      bool Overlap = false;
-      for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
-        if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
-          Overlap = true;
-          break;
-        }
-      }
-
-      if (CurrBase == 0 && !Clobber) {
+      if (CurrBase == 0) {
         // Start of a new chain.
         CurrBase = Base;
         CurrOpc  = Opcode;
         CurrSize = Size;
         CurrPred = Pred;
         CurrPredReg = PredReg;
-        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
-        ++NumMemOps;
-        Advance = true;
-      } else if (!Overlap) {
-        if (Clobber) {
-          TryMerge = true;
-          Advance = true;
+        MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+        continue;
+      }
+      // Note: No need to match PredReg in the next if.
+      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+        // Watch out for:
+        //   r4 := ldr [r0, #8]
+        //   r4 := ldr [r0, #4]
+        // or
+        //   r0 := ldr [r0]
+        // If a load overrides the base register or a register loaded by
+        // another load in our chain, we cannot take this instruction.
+        bool Overlap = false;
+        if (isLoadSingle(Opcode)) {
+          Overlap = (Base == Reg);
+          if (!Overlap) {
+            for (const MemOpQueueEntry &E : MemOps) {
+              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
+                Overlap = true;
+                break;
+              }
+            }
+          }
         }
 
-        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
-          // No need to match PredReg.
-          // Continue adding to the queue.
+        if (!Overlap) {
+          // Check offset and sort memory operation into the current chain.
           if (Offset > MemOps.back().Offset) {
-            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
-                                             Position, MBBI));
-            ++NumMemOps;
-            Advance = true;
+            MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+            continue;
           } else {
-            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
-                 I != E; ++I) {
-              if (Offset < I->Offset) {
-                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
-                                                 Position, MBBI));
-                ++NumMemOps;
-                Advance = true;
+            MemOpQueue::iterator MI, ME;
+            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
+              if (Offset < MI->Offset) {
+                // Found a place to insert.
                 break;
-              } else if (Offset == I->Offset) {
-                // Collision! This can't be merged!
+              }
+              if (Offset == MI->Offset) {
+                // Collision, abort.
+                MI = ME;
                 break;
               }
             }
+            if (MI != MemOps.end()) {
+              MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
+              continue;
+            }
           }
         }
       }
-    }
-
-    if (MBBI->isDebugValue()) {
-      ++MBBI;
-      if (MBBI == E)
-        // Reach the end of the block, try merging the memory instructions.
-        TryMerge = true;
-    } else if (Advance) {
-      ++Position;
-      ++MBBI;
-      if (MBBI == E)
-        // Reach the end of the block, try merging the memory instructions.
-        TryMerge = true;
-    } else {
-      TryMerge = true;
-    }
 
-    if (TryMerge) {
-      if (NumMemOps > 1) {
-        // Try to find a free register to use as a new base in case it's needed.
-        // First advance to the instruction just before the start of the chain.
-        AdvanceRS(MBB, MemOps);
-
-        // Find a scratch register.
-        unsigned Scratch =
-          RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);
-
-        // Process the load / store instructions.
-        RS->forward(std::prev(MBBI));
-
-        // Merge ops.
-        Merges.clear();
-        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
-                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);
-
-        // Try folding preceding/trailing base inc/dec into the generated
-        // LDM/STM ops.
-        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
-          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
-            ++NumMerges;
-        NumMerges += Merges.size();
-
-        // Try folding preceding/trailing base inc/dec into those load/store
-        // that were not merged to form LDM/STM ops.
-        for (unsigned i = 0; i != NumMemOps; ++i)
-          if (!MemOps[i].Merged)
-            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
-              ++NumMerges;
-
-        // RS may be pointing to an instruction that's deleted.
-        RS->skipTo(std::prev(MBBI));
-      } else if (NumMemOps == 1) {
-        // Try folding preceding/trailing base inc/dec into the single
-        // load/store.
-        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
-          ++NumMerges;
-          RS->forward(std::prev(MBBI));
-        }
-      }
+      // Don't advance the iterator; The op will start a new chain next.
+      MBBI = I;
+      --Position;
+      // Fallthrough to look into existing chain.
+    } else if (MBBI->isDebugValue())
+      continue;
 
+    // If we are here then the chain is broken; Extract candidates for a merge.
+    if (MemOps.size() > 0) {
+      FormCandidates(MemOps);
+      // Reset for the next chain.
       CurrBase = 0;
       CurrOpc = ~0u;
       CurrSize = 0;
       CurrPred = ARMCC::AL;
       CurrPredReg = 0;
-      if (NumMemOps) {
-        MemOps.clear();
-        NumMemOps = 0;
-      }
+      MemOps.clear();
+    }
+  }
+  if (MemOps.size() > 0)
+    FormCandidates(MemOps);
 
-      // If iterator hasn't been advanced and this is not a memory op, skip it.
-      // It can't start a new chain anyway.
-      if (!Advance && !isMemOp && MBBI != E) {
-        ++Position;
-        ++MBBI;
+  // Sort candidates so they get processed from end to begin of the basic
+  // block later; This is necessary for liveness calculation.
+  auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
+    return M0->InsertPos < M1->InsertPos;
+  };
+  std::sort(Candidates.begin(), Candidates.end(), LessThan);
+
+  // Go through list of candidates and merge.
+  bool Changed = false;
+  for (const MergeCandidate *Candidate : Candidates) {
+    if (Candidate->Instrs.size() > 1) {
+      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+      // Merge preceding/trailing base inc/dec into the merged op.
+      if (Merged) {
+        MergeBaseUpdateLSMultiple(Merged);
+        Changed = true;
+      } else {
+        for (MachineInstr *MI : Candidate->Instrs) {
+          if (MergeBaseUpdateLoadStore(MI))
+            Changed = true;
+        }
       }
+    } else {
+      assert(Candidate->Instrs.size() == 1);
+      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
+        Changed = true;
     }
   }
-  return NumMerges > 0;
+  Candidates.clear();
+
+  return Changed;
 }
 
 /// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
@@ -1814,12 +1742,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM
 }
 
 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  MF = &Fn;
   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   TL = STI->getTargetLowering();
   AFI = Fn.getInfo<ARMFunctionInfo>();
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
-  RS = new RegScavenger();
+  MRI = &Fn.getRegInfo();
+  RegClassInfoValid = false;
   isThumb2 = AFI->isThumb2Function();
   isThumb1 = AFI->isThumbFunction() && !isThumb2;
 
@@ -1832,7 +1762,7 @@ bool ARMLoadStoreOpt::runOnMachineFuncti
       Modified |= MergeReturnIntoLDM(MBB);
   }
 
-  delete RS;
+  Allocator.Reset();
   return Modified;
 }
 
@@ -2219,7 +2149,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadS
         continue;
 
       int Opc = MI->getOpcode();
-      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+      bool isLd = isLoadSingle(Opc);
       unsigned Base = MI->getOperand(1).getReg();
       int Offset = getMemoryOpOffset(MI);
 

Modified: llvm/trunk/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll?rev=241920&r1=241919&r2=241920&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll Fri Jul 10 13:08:49 2015
@@ -9,8 +9,8 @@ define void @foo(%struct4bytes* byval %p
 ) {
 ;CHECK:  sub  sp, sp, #16
 ;CHECK:  push  {r11, lr}
-;CHECK:  add  r11, sp, #8
-;CHECK:  stm  r11, {r0, r1, r2, r3}
+;CHECK:  add  r12, sp, #8
+;CHECK:  stm  r12, {r0, r1, r2, r3}
 ;CHECK:  add  r0, sp, #12
 ;CHECK:  bl  useInt
 ;CHECK:  pop  {r11, lr}
