[llvm] r265547 - Recommit r265309 after fixed an invalid memory reference bug happened

Wei Mi via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 6 08:41:08 PDT 2016


Author: wmi
Date: Wed Apr  6 10:41:07 2016
New Revision: 265547

URL: http://llvm.org/viewvc/llvm-project?rev=265547&view=rev
Log:
Recommit r265309 after fixing an invalid memory reference bug that happened
when DenseMap grew and moved memory. I verified that it fixes the bootstrap
problem on x86_64-linux-gnu, but I cannot verify whether it fixes
the bootstrap error on clang-ppc64be-linux. I will watch the build-bot
results closely.

Replace analyzeSiblingValues with a new algorithm to fix its compile
time issue. The patch solves PR17409 and its duplicates.

analyzeSiblingValues is an N x N complexity algorithm where N is
the number of siblings generated by reg splitting. Although it
causes significant compile time issues when N is large, it is also
important for performance since it removes redundant spills and
enables rematerialization.

To solve the compile time issue, the patch removes analyzeSiblingValues
and replaces it with lower cost alternatives containing two parts. The
first part creates a new spill hoisting method in postOptimization of
register allocation. It does spill hoisting at once after all the spills
are generated instead of inside every instance of selectOrSplit. The
second part queries the defining expr of the original register for
rematerialization and keeps it always available during register allocation
even if it is already dead. It deletes those dead instructions only in
postOptimization. With the two parts in the patch, it can remove
analyzeSiblingValues without sacrificing performance.

Differential Revision: http://reviews.llvm.org/D15302

Added:
    llvm/trunk/test/CodeGen/X86/hoist-spill.ll
    llvm/trunk/test/CodeGen/X86/new-remat.ll
Removed:
    llvm/trunk/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
Modified:
    llvm/trunk/include/llvm/CodeGen/LiveRangeEdit.h
    llvm/trunk/lib/CodeGen/InlineSpiller.cpp
    llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp
    llvm/trunk/lib/CodeGen/RegAllocBase.cpp
    llvm/trunk/lib/CodeGen/RegAllocBase.h
    llvm/trunk/lib/CodeGen/RegAllocBasic.cpp
    llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
    llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp
    llvm/trunk/lib/CodeGen/Spiller.h
    llvm/trunk/lib/CodeGen/SplitKit.cpp
    llvm/trunk/lib/CodeGen/SplitKit.h
    llvm/trunk/test/CodeGen/X86/fp128-compare.ll
    llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll

Modified: llvm/trunk/include/llvm/CodeGen/LiveRangeEdit.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/LiveRangeEdit.h?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/LiveRangeEdit.h (original)
+++ llvm/trunk/include/llvm/CodeGen/LiveRangeEdit.h Wed Apr  6 10:41:07 2016
@@ -72,6 +72,10 @@ private:
   /// ScannedRemattable - true when remattable values have been identified.
   bool ScannedRemattable;
 
+  /// DeadRemats - The saved instructions which have already been dead after
+  /// rematerialization but not deleted yet -- to be done in postOptimization.
+  SmallPtrSet<MachineInstr *, 32> *DeadRemats;
+
   /// Remattable - Values defined by remattable instructions as identified by
   /// tii.isTriviallyReMaterializable().
   SmallPtrSet<const VNInfo*,4> Remattable;
@@ -116,13 +120,16 @@ public:
   /// @param vrm Map of virtual registers to physical registers for this
   ///            function.  If NULL, no virtual register map updates will
   ///            be done.  This could be the case if called before Regalloc.
+  /// @param deadRemats The collection of all the instructions defining an
+  ///                   original reg and are dead after remat.
   LiveRangeEdit(LiveInterval *parent, SmallVectorImpl<unsigned> &newRegs,
                 MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm,
-                Delegate *delegate = nullptr)
+                Delegate *delegate = nullptr,
+                SmallPtrSet<MachineInstr *, 32> *deadRemats = nullptr)
       : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
-        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()),
-        TheDelegate(delegate), FirstNew(newRegs.size()),
-        ScannedRemattable(false) {
+        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate),
+        FirstNew(newRegs.size()), ScannedRemattable(false),
+        DeadRemats(deadRemats) {
     MRI.setDelegate(this);
   }
 
@@ -142,6 +149,16 @@ public:
   bool empty() const { return size() == 0; }
   unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
 
+  /// pop_back - It allows LiveRangeEdit users to drop new registers.
+  /// The context is when an original def instruction of a register is
+  /// dead after rematerialization, we still want to keep it for following
+  /// rematerializations. We save the def instruction in DeadRemats,
+  /// and replace the original dst register with a new dummy register so
+  /// the live range of original dst register can be shrinked normally.
+  /// We don't want to allocate phys register for the dummy register, so
+  /// we want to drop it from the NewRegs set.
+  void pop_back() { NewRegs.pop_back(); }
+
   ArrayRef<unsigned> regs() const {
     return makeArrayRef(NewRegs).slice(FirstNew);
   }
@@ -175,15 +192,15 @@ public:
   /// Remat - Information needed to rematerialize at a specific location.
   struct Remat {
     VNInfo *ParentVNI;      // parent_'s value at the remat location.
-    MachineInstr *OrigMI;   // Instruction defining ParentVNI.
+    MachineInstr *OrigMI;   // Instruction defining OrigVNI. It contains the
+                            // real expr for remat.
     explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {}
   };
 
   /// canRematerializeAt - Determine if ParentVNI can be rematerialized at
   /// UseIdx. It is assumed that parent_.getVNINfoAt(UseIdx) == ParentVNI.
   /// When cheapAsAMove is set, only cheap remats are allowed.
-  bool canRematerializeAt(Remat &RM,
-                          SlotIndex UseIdx,
+  bool canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx,
                           bool cheapAsAMove);
 
   /// rematerializeAt - Rematerialize RM.ParentVNI into DestReg by inserting an
@@ -208,6 +225,12 @@ public:
     return Rematted.count(ParentVNI);
   }
 
+  void markDeadRemat(MachineInstr *inst) {
+    // DeadRemats is an optional field.
+    if (DeadRemats)
+      DeadRemats->insert(inst);
+  }
+
   /// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
   /// to erase it from LIS.
   void eraseVirtReg(unsigned Reg);
@@ -218,8 +241,11 @@ public:
   /// RegsBeingSpilled lists registers currently being spilled by the register
   /// allocator.  These registers should not be split into new intervals
   /// as currently those new intervals are not guaranteed to spill.
-  void eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                         ArrayRef<unsigned> RegsBeingSpilled = None);
+  /// NoSplit indicates this func is used after the iterations of selectOrSplit
+  /// where registers should not be split into new intervals.
+  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                         ArrayRef<unsigned> RegsBeingSpilled = None,
+                         bool NoSplit = false);
 
   /// calculateRegClassAndHint - Recompute register class and hint for each new
   /// register.

Modified: llvm/trunk/lib/CodeGen/InlineSpiller.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/InlineSpiller.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/InlineSpiller.cpp (original)
+++ llvm/trunk/lib/CodeGen/InlineSpiller.cpp Wed Apr  6 10:41:07 2016
@@ -48,13 +48,77 @@ STATISTIC(NumReloadsRemoved,  "Number of
 STATISTIC(NumFolded,          "Number of folded stack accesses");
 STATISTIC(NumFoldedLoads,     "Number of folded loads");
 STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
-STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
-STATISTIC(NumHoists,          "Number of hoisted spills");
 
 static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
                                      cl::desc("Disable inline spill hoisting"));
 
 namespace {
+class HoistSpillHelper {
+  LiveIntervals &LIS;
+  LiveStacks &LSS;
+  AliasAnalysis *AA;
+  MachineDominatorTree &MDT;
+  MachineLoopInfo &Loops;
+  VirtRegMap &VRM;
+  MachineFrameInfo &MFI;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
+
+  // Map from StackSlot to its original register.
+  DenseMap<int, unsigned> StackSlotToReg;
+  // Map from pair of (StackSlot and Original VNI) to a set of spills which
+  // have the same stackslot and have equal values defined by Original VNI.
+  // These spills are mergeable and are hoist candiates.
+  typedef DenseMap<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>
+      MergeableSpillsMap;
+  MergeableSpillsMap MergeableSpills;
+
+  /// This is the map from original register to a set containing all its
+  /// siblings. To hoist a spill to another BB, we need to find out a live
+  /// sibling there and use it as the source of the new spill.
+  DenseMap<unsigned, SmallSetVector<unsigned, 16>> Virt2SiblingsMap;
+
+  bool isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI, MachineBasicBlock &BB,
+                     unsigned &LiveReg);
+
+  void rmRedundantSpills(
+      SmallPtrSet<MachineInstr *, 16> &Spills,
+      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+      DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+  void getVisitOrders(
+      MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+      SmallVectorImpl<MachineDomTreeNode *> &Orders,
+      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+      DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+      DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+  void runHoistSpills(unsigned OrigReg, VNInfo &OrigVNI,
+                      SmallPtrSet<MachineInstr *, 16> &Spills,
+                      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+                      DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+  HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf,
+                   VirtRegMap &vrm)
+      : LIS(pass.getAnalysis<LiveIntervals>()),
+        LSS(pass.getAnalysis<LiveStacks>()),
+        AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+        MDT(pass.getAnalysis<MachineDominatorTree>()),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+        TII(*mf.getSubtarget().getInstrInfo()),
+        TRI(*mf.getSubtarget().getRegisterInfo()),
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+
+  void addToMergeableSpills(MachineInstr *Spill, int StackSlot,
+                            unsigned Original);
+  bool rmFromMergeableSpills(MachineInstr *Spill, int StackSlot);
+  void hoistAllSpills(LiveRangeEdit &Edit);
+};
+
 class InlineSpiller : public Spiller {
   MachineFunction &MF;
   LiveIntervals &LIS;
@@ -85,56 +149,12 @@ class InlineSpiller : public Spiller {
   // Values that failed to remat at some point.
   SmallPtrSet<VNInfo*, 8> UsedValues;
 
-public:
-  // Information about a value that was defined by a copy from a sibling
-  // register.
-  struct SibValueInfo {
-    // True when all reaching defs were reloads: No spill is necessary.
-    bool AllDefsAreReloads;
-
-    // True when value is defined by an original PHI not from splitting.
-    bool DefByOrigPHI;
-
-    // True when the COPY defining this value killed its source.
-    bool KillsSource;
-
-    // The preferred register to spill.
-    unsigned SpillReg;
-
-    // The value of SpillReg that should be spilled.
-    VNInfo *SpillVNI;
-
-    // The block where SpillVNI should be spilled. Currently, this must be the
-    // block containing SpillVNI->def.
-    MachineBasicBlock *SpillMBB;
-
-    // A defining instruction that is not a sibling copy or a reload, or NULL.
-    // This can be used as a template for rematerialization.
-    MachineInstr *DefMI;
-
-    // List of values that depend on this one.  These values are actually the
-    // same, but live range splitting has placed them in different registers,
-    // or SSA update needed to insert PHI-defs to preserve SSA form.  This is
-    // copies of the current value and phi-kills.  Usually only phi-kills cause
-    // more than one dependent value.
-    TinyPtrVector<VNInfo*> Deps;
-
-    SibValueInfo(unsigned Reg, VNInfo *VNI)
-      : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
-        SpillReg(Reg), SpillVNI(VNI), SpillMBB(nullptr), DefMI(nullptr) {}
-
-    // Returns true when a def has been found.
-    bool hasDef() const { return DefByOrigPHI || DefMI; }
-  };
-
-private:
-  // Values in RegsToSpill defined by sibling copies.
-  typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
-  SibValueMap SibValues;
-
   // Dead defs generated during spilling.
   SmallVector<MachineInstr*, 8> DeadDefs;
 
+  // Object records spills information and does the hoisting.
+  HoistSpillHelper HSpiller;
+
   ~InlineSpiller() override {}
 
 public:
@@ -147,9 +167,11 @@ public:
         MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
         TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
-        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+        HSpiller(pass, mf, vrm) {}
 
   void spill(LiveRangeEdit &) override;
+  void postOptimization() override;
 
 private:
   bool isSnippet(const LiveInterval &SnipLI);
@@ -161,11 +183,7 @@ private:
   }
 
   bool isSibling(unsigned Reg);
-  MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
-  void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = nullptr);
-  void analyzeSiblingValues();
-
-  bool hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI);
+  bool hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstr &CopyMI);
   void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
 
   void markValueUsed(LiveInterval*, VNInfo*);
@@ -297,417 +315,45 @@ void InlineSpiller::collectRegsToSpill()
   }
 }
 
-
-//===----------------------------------------------------------------------===//
-//                            Sibling Values
-//===----------------------------------------------------------------------===//
-
-// After live range splitting, some values to be spilled may be defined by
-// copies from sibling registers. We trace the sibling copies back to the
-// original value if it still exists. We need it for rematerialization.
-//
-// Even when the value can't be rematerialized, we still want to determine if
-// the value has already been spilled, or we may want to hoist the spill from a
-// loop.
-
 bool InlineSpiller::isSibling(unsigned Reg) {
   return TargetRegisterInfo::isVirtualRegister(Reg) &&
            VRM.getOriginal(Reg) == Original;
 }
 
-#ifndef NDEBUG
-static raw_ostream &operator<<(raw_ostream &OS,
-                               const InlineSpiller::SibValueInfo &SVI) {
-  OS << "spill " << PrintReg(SVI.SpillReg) << ':'
-     << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def;
-  if (SVI.SpillMBB)
-    OS << " in BB#" << SVI.SpillMBB->getNumber();
-  if (SVI.AllDefsAreReloads)
-    OS << " all-reloads";
-  if (SVI.DefByOrigPHI)
-    OS << " orig-phi";
-  if (SVI.KillsSource)
-    OS << " kill";
-  OS << " deps[";
-  for (VNInfo *Dep : SVI.Deps)
-    OS << ' ' << Dep->id << '@' << Dep->def;
-  OS << " ]";
-  if (SVI.DefMI)
-    OS << " def: " << *SVI.DefMI;
-  else
-    OS << '\n';
-  return OS;
-}
-#endif
-
-/// propagateSiblingValue - Propagate the value in SVI to dependents if it is
-/// known.  Otherwise remember the dependency for later.
+/// It is beneficial to spill to earlier place in the same BB in case
+/// as follows:
+/// There is an alternative def earlier in the same MBB.
+/// Hoist the spill as far as possible in SpillMBB. This can ease
+/// register pressure:
 ///
-/// @param SVIIter SibValues entry to propagate.
-/// @param VNI Dependent value, or NULL to propagate to all saved dependents.
-void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter,
-                                          VNInfo *VNI) {
-  SibValueMap::value_type *SVI = &*SVIIter;
-
-  // When VNI is non-NULL, add it to SVI's deps, and only propagate to that.
-  TinyPtrVector<VNInfo*> FirstDeps;
-  if (VNI) {
-    FirstDeps.push_back(VNI);
-    SVI->second.Deps.push_back(VNI);
-  }
-
-  // Has the value been completely determined yet?  If not, defer propagation.
-  if (!SVI->second.hasDef())
-    return;
-
-  // Work list of values to propagate.
-  SmallSetVector<SibValueMap::value_type *, 8> WorkList;
-  WorkList.insert(SVI);
-
-  do {
-    SVI = WorkList.pop_back_val();
-    TinyPtrVector<VNInfo*> *Deps = VNI ? &FirstDeps : &SVI->second.Deps;
-    VNI = nullptr;
-
-    SibValueInfo &SV = SVI->second;
-    if (!SV.SpillMBB)
-      SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def);
-
-    DEBUG(dbgs() << "  prop to " << Deps->size() << ": "
-                 << SVI->first->id << '@' << SVI->first->def << ":\t" << SV);
-
-    assert(SV.hasDef() && "Propagating undefined value");
-
-    // Should this value be propagated as a preferred spill candidate?  We don't
-    // propagate values of registers that are about to spill.
-    bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg);
-    unsigned SpillDepth = ~0u;
-
-    for (VNInfo *Dep : *Deps) {
-      SibValueMap::iterator DepSVI = SibValues.find(Dep);
-      assert(DepSVI != SibValues.end() && "Dependent value not in SibValues");
-      SibValueInfo &DepSV = DepSVI->second;
-      if (!DepSV.SpillMBB)
-        DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def);
-
-      bool Changed = false;
-
-      // Propagate defining instruction.
-      if (!DepSV.hasDef()) {
-        Changed = true;
-        DepSV.DefMI = SV.DefMI;
-        DepSV.DefByOrigPHI = SV.DefByOrigPHI;
-      }
-
-      // Propagate AllDefsAreReloads.  For PHI values, this computes an AND of
-      // all predecessors.
-      if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) {
-        Changed = true;
-        DepSV.AllDefsAreReloads = false;
-      }
-
-      // Propagate best spill value.
-      if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) {
-        if (SV.SpillMBB == DepSV.SpillMBB) {
-          // DepSV is in the same block.  Hoist when dominated.
-          if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) {
-            // This is an alternative def earlier in the same MBB.
-            // Hoist the spill as far as possible in SpillMBB. This can ease
-            // register pressure:
-            //
-            //   x = def
-            //   y = use x
-            //   s = copy x
-            //
-            // Hoisting the spill of s to immediately after the def removes the
-            // interference between x and y:
-            //
-            //   x = def
-            //   spill x
-            //   y = use x<kill>
-            //
-            // This hoist only helps when the DepSV copy kills its source.
-            Changed = true;
-            DepSV.SpillReg = SV.SpillReg;
-            DepSV.SpillVNI = SV.SpillVNI;
-            DepSV.SpillMBB = SV.SpillMBB;
-          }
-        } else {
-          // DepSV is in a different block.
-          if (SpillDepth == ~0u)
-            SpillDepth = Loops.getLoopDepth(SV.SpillMBB);
-
-          // Also hoist spills to blocks with smaller loop depth, but make sure
-          // that the new value dominates.  Non-phi dependents are always
-          // dominated, phis need checking.
-
-          const BranchProbability MarginProb(4, 5); // 80%
-          // Hoist a spill to outer loop if there are multiple dependents (it
-          // can be beneficial if more than one dependents are hoisted) or
-          // if DepSV (the hoisting source) is hotter than SV (the hoisting
-          // destination) (we add a 80% margin to bias a little towards
-          // loop depth).
-          bool HoistCondition =
-            (MBFI.getBlockFreq(DepSV.SpillMBB) >=
-             (MBFI.getBlockFreq(SV.SpillMBB) * MarginProb)) ||
-            Deps->size() > 1;
-
-          if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) &&
-              HoistCondition &&
-              (!DepSVI->first->isPHIDef() ||
-               MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) {
-            Changed = true;
-            DepSV.SpillReg = SV.SpillReg;
-            DepSV.SpillVNI = SV.SpillVNI;
-            DepSV.SpillMBB = SV.SpillMBB;
-          }
-        }
-      }
-
-      if (!Changed)
-        continue;
-
-      // Something changed in DepSVI. Propagate to dependents.
-      WorkList.insert(&*DepSVI);
-
-      DEBUG(dbgs() << "  update " << DepSVI->first->id << '@'
-            << DepSVI->first->def << " to:\t" << DepSV);
-    }
-  } while (!WorkList.empty());
-}
-
-/// traceSiblingValue - Trace a value that is about to be spilled back to the
-/// real defining instructions by looking through sibling copies. Always stay
-/// within the range of OrigVNI so the registers are known to carry the same
-/// value.
+///   x = def
+///   y = use x
+///   s = copy x
 ///
-/// Determine if the value is defined by all reloads, so spilling isn't
-/// necessary - the value is already in the stack slot.
+/// Hoisting the spill of s to immediately after the def removes the
+/// interference between x and y:
 ///
-/// Return a defining instruction that may be a candidate for rematerialization.
+///   x = def
+///   spill x
+///   y = use x<kill>
 ///
-MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
-                                               VNInfo *OrigVNI) {
-  // Check if a cached value already exists.
-  SibValueMap::iterator SVI;
-  bool Inserted;
-  std::tie(SVI, Inserted) =
-    SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI)));
-  if (!Inserted) {
-    DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':'
-                 << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second);
-    return SVI->second.DefMI;
-  }
-
-  DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':'
-               << UseVNI->id << '@' << UseVNI->def << '\n');
-
-  // List of (Reg, VNI) that have been inserted into SibValues, but need to be
-  // processed.
-  SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
-  WorkList.push_back(std::make_pair(UseReg, UseVNI));
-
-  LiveInterval &OrigLI = LIS.getInterval(Original);
-  do {
-    unsigned Reg;
-    VNInfo *VNI;
-    std::tie(Reg, VNI) = WorkList.pop_back_val();
-    DEBUG(dbgs() << "  " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def
-                 << ":\t");
-
-    // First check if this value has already been computed.
-    SVI = SibValues.find(VNI);
-    assert(SVI != SibValues.end() && "Missing SibValues entry");
-
-    // Trace through PHI-defs created by live range splitting.
-    if (VNI->isPHIDef()) {
-      // Stop at original PHIs.  We don't know the value at the
-      // predecessors. Look up the VNInfo for the current definition
-      // in OrigLI, to properly determine whether or not this phi was
-      // added by splitting.
-      if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) {
-        DEBUG(dbgs() << "orig phi value\n");
-        SVI->second.DefByOrigPHI = true;
-        SVI->second.AllDefsAreReloads = false;
-        propagateSiblingValue(SVI);
-        continue;
-      }
-
-      // This is a PHI inserted by live range splitting.  We could trace the
-      // live-out value from predecessor blocks, but that search can be very
-      // expensive if there are many predecessors and many more PHIs as
-      // generated by tail-dup when it sees an indirectbr.  Instead, look at
-      // all the non-PHI defs that have the same value as OrigVNI.  They must
-      // jointly dominate VNI->def.  This is not optimal since VNI may actually
-      // be jointly dominated by a smaller subset of defs, so there is a change
-      // we will miss a AllDefsAreReloads optimization.
-
-      // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
-      SmallVector<VNInfo*, 8> PHIs, NonPHIs;
-      LiveInterval &LI = LIS.getInterval(Reg);
-
-      for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
-           VI != VE; ++VI) {
-        VNInfo *VNI2 = *VI;
-        if (VNI2->isUnused())
-          continue;
-        if (!OrigLI.containsOneValue() &&
-            OrigLI.getVNInfoAt(VNI2->def) != OrigVNI)
-          continue;
-        if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def)
-          PHIs.push_back(VNI2);
-        else
-          NonPHIs.push_back(VNI2);
-      }
-      DEBUG(dbgs() << "split phi value, checking " << PHIs.size()
-                   << " phi-defs, and " << NonPHIs.size()
-                   << " non-phi/orig defs\n");
-
-      // Create entries for all the PHIs.  Don't add them to the worklist, we
-      // are processing all of them in one go here.
-      for (VNInfo *PHI : PHIs)
-        SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI)));
-
-      // Add every PHI as a dependent of all the non-PHIs.
-      for (VNInfo *NonPHI : NonPHIs) {
-        // Known value? Try an insertion.
-        std::tie(SVI, Inserted) =
-          SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
-        // Add all the PHIs as dependents of NonPHI.
-        SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(),
-                                PHIs.end());
-        // This is the first time we see NonPHI, add it to the worklist.
-        if (Inserted)
-          WorkList.push_back(std::make_pair(Reg, NonPHI));
-        else
-          // Propagate to all inserted PHIs, not just VNI.
-          propagateSiblingValue(SVI);
-      }
-
-      // Next work list item.
-      continue;
-    }
-
-    MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
-    assert(MI && "Missing def");
-
-    // Trace through sibling copies.
-    if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
-      if (isSibling(SrcReg)) {
-        LiveInterval &SrcLI = LIS.getInterval(SrcReg);
-        LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
-        assert(SrcQ.valueIn() && "Copy from non-existing value");
-        // Check if this COPY kills its source.
-        SVI->second.KillsSource = SrcQ.isKill();
-        VNInfo *SrcVNI = SrcQ.valueIn();
-        DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':'
-                     << SrcVNI->id << '@' << SrcVNI->def
-                     << " kill=" << unsigned(SVI->second.KillsSource) << '\n');
-        // Known sibling source value? Try an insertion.
-        std::tie(SVI, Inserted) = SibValues.insert(
-            std::make_pair(SrcVNI, SibValueInfo(SrcReg, SrcVNI)));
-        // This is the first time we see Src, add it to the worklist.
-        if (Inserted)
-          WorkList.push_back(std::make_pair(SrcReg, SrcVNI));
-        propagateSiblingValue(SVI, VNI);
-        // Next work list item.
-        continue;
-      }
-    }
-
-    // Track reachable reloads.
-    SVI->second.DefMI = MI;
-    SVI->second.SpillMBB = MI->getParent();
-    int FI;
-    if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) {
-      DEBUG(dbgs() << "reload\n");
-      propagateSiblingValue(SVI);
-      // Next work list item.
-      continue;
-    }
-
-    // Potential remat candidate.
-    DEBUG(dbgs() << "def " << *MI);
-    SVI->second.AllDefsAreReloads = false;
-    propagateSiblingValue(SVI);
-  } while (!WorkList.empty());
-
-  // Look up the value we were looking for.  We already did this lookup at the
-  // top of the function, but SibValues may have been invalidated.
-  SVI = SibValues.find(UseVNI);
-  assert(SVI != SibValues.end() && "Didn't compute requested info");
-  DEBUG(dbgs() << "  traced to:\t" << SVI->second);
-  return SVI->second.DefMI;
-}
-
-/// analyzeSiblingValues - Trace values defined by sibling copies back to
-/// something that isn't a sibling copy.
+/// This hoist only helps when the copy kills its source.
 ///
-/// Keep track of values that may be rematerializable.
-void InlineSpiller::analyzeSiblingValues() {
-  SibValues.clear();
-
-  // No siblings at all?
-  if (Edit->getReg() == Original)
-    return;
-
-  LiveInterval &OrigLI = LIS.getInterval(Original);
-  for (unsigned Reg : RegsToSpill) {
-    LiveInterval &LI = LIS.getInterval(Reg);
-    for (LiveInterval::const_vni_iterator VI = LI.vni_begin(),
-         VE = LI.vni_end(); VI != VE; ++VI) {
-      VNInfo *VNI = *VI;
-      if (VNI->isUnused())
-        continue;
-      MachineInstr *DefMI = nullptr;
-      if (!VNI->isPHIDef()) {
-       DefMI = LIS.getInstructionFromIndex(VNI->def);
-       assert(DefMI && "No defining instruction");
-      }
-      // Check possible sibling copies.
-      if (VNI->isPHIDef() || DefMI->isCopy()) {
-        VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
-        assert(OrigVNI && "Def outside original live range");
-        if (OrigVNI->def != VNI->def)
-          DefMI = traceSiblingValue(Reg, VNI, OrigVNI);
-      }
-      if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) {
-        DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@'
-                     << VNI->def << " may remat from " << *DefMI);
-      }
-    }
-  }
-}
-
-/// hoistSpill - Given a sibling copy that defines a value to be spilled, insert
-/// a spill at a better location.
-bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI) {
+bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
+                                       MachineInstr &CopyMI) {
   SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
+#ifndef NDEBUG
   VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot());
   assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
-  SibValueMap::iterator I = SibValues.find(VNI);
-  if (I == SibValues.end())
-    return false;
-
-  const SibValueInfo &SVI = I->second;
-
-  // Let the normal folding code deal with the boring case.
-  if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI)
-    return false;
-
-  // SpillReg may have been deleted by remat and DCE.
-  if (!LIS.hasInterval(SVI.SpillReg)) {
-    DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n');
-    SibValues.erase(I);
-    return false;
-  }
+#endif
 
-  LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg);
-  if (!SibLI.containsValue(SVI.SpillVNI)) {
-    DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n');
-    SibValues.erase(I);
+  unsigned SrcReg = CopyMI.getOperand(1).getReg();
+  LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+  VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx);
+  LiveQueryResult SrcQ = SrcLI.Query(Idx);
+  MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(SrcVNI->def);
+  if (DefMBB != CopyMI.getParent() || !SrcQ.isKill())
     return false;
-  }
 
   // Conservatively extend the stack slot range to the range of the original
   // value. We may be able to do better with stack slot coloring by being more
@@ -719,35 +365,29 @@ bool InlineSpiller::hoistSpill(LiveInter
   DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
                << *StackInt << '\n');
 
-  // Already spilled everywhere.
-  if (SVI.AllDefsAreReloads) {
-    DEBUG(dbgs() << "\tno spill needed: " << SVI);
-    ++NumOmitReloadSpill;
-    return true;
-  }
-  // We are going to spill SVI.SpillVNI immediately after its def, so clear out
+  // We are going to spill SrcVNI immediately after its def, so clear out
   // any later spills of the same value.
-  eliminateRedundantSpills(SibLI, SVI.SpillVNI);
+  eliminateRedundantSpills(SrcLI, SrcVNI);
 
-  MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def);
+  MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
   MachineBasicBlock::iterator MII;
-  if (SVI.SpillVNI->isPHIDef())
+  if (SrcVNI->isPHIDef())
     MII = MBB->SkipPHIsAndLabels(MBB->begin());
   else {
-    MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
     assert(DefMI && "Defining instruction disappeared");
     MII = DefMI;
     ++MII;
   }
   // Insert spill without kill flag immediately after def.
-  TII.storeRegToStackSlot(*MBB, MII, SVI.SpillReg, false, StackSlot,
-                          MRI.getRegClass(SVI.SpillReg), &TRI);
+  TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
+                          MRI.getRegClass(SrcReg), &TRI);
   --MII; // Point to store instruction.
   LIS.InsertMachineInstrInMaps(*MII);
-  DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
+  DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
 
+  HSpiller.addToMergeableSpills(&(*MII), StackSlot, Original);
   ++NumSpills;
-  ++NumHoists;
   return true;
 }
 
@@ -805,7 +445,8 @@ void InlineSpiller::eliminateRedundantSp
         MI->setDesc(TII.get(TargetOpcode::KILL));
         DeadDefs.push_back(MI);
         ++NumSpillsRemoved;
-        --NumSpills;
+        if (HSpiller.rmFromMergeableSpills(MI, StackSlot))
+          --NumSpills;
       }
     }
   } while (!WorkList.empty());
@@ -876,12 +517,12 @@ bool InlineSpiller::reMaterializeFor(Liv
   if (SnippetCopies.count(&MI))
     return false;
 
-  // Use an OrigVNI from traceSiblingValue when ParentVNI is a sibling copy.
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
   LiveRangeEdit::Remat RM(ParentVNI);
-  SibValueMap::const_iterator SibI = SibValues.find(ParentVNI);
-  if (SibI != SibValues.end())
-    RM.OrigMI = SibI->second.DefMI;
-  if (!Edit->canRematerializeAt(RM, UseIdx, false)) {
+  RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+  if (!Edit->canRematerializeAt(RM, OrigVNI, UseIdx, false)) {
     markValueUsed(&VirtReg, ParentVNI);
     DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
     return false;
@@ -931,7 +572,6 @@ bool InlineSpiller::reMaterializeFor(Liv
 /// reMaterializeAll - Try to rematerialize as many uses as possible,
 /// and trim the live ranges after.
 void InlineSpiller::reMaterializeAll() {
-  // analyzeSiblingValues has already tested all relevant defining instructions.
   if (!Edit->anyRematerializable(AA))
     return;
 
@@ -1017,6 +657,9 @@ bool InlineSpiller::coalesceStackAccess(
   if (InstrReg != Reg || FI != StackSlot)
     return false;
 
+  if (!IsLoad)
+    HSpiller.rmFromMergeableSpills(MI, StackSlot);
+
   DEBUG(dbgs() << "Coalescing stack access: " << *MI);
   LIS.RemoveMachineInstrFromMaps(*MI);
   MI->eraseFromParent();
@@ -1141,6 +784,9 @@ foldMemoryOperand(ArrayRef<std::pair<Mac
     LIS.removePhysRegDefAt(Reg, Idx);
   }
 
+  int FI;
+  if (TII.isStoreToStackSlot(MI, FI) && HSpiller.rmFromMergeableSpills(MI, FI))
+    --NumSpills;
   LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
   MI->eraseFromParent();
 
@@ -1166,9 +812,10 @@ foldMemoryOperand(ArrayRef<std::pair<Mac
 
   if (!WasCopy)
     ++NumFolded;
-  else if (Ops.front().second == 0)
+  else if (Ops.front().second == 0) {
     ++NumSpills;
-  else
+    HSpiller.addToMergeableSpills(FoldMI, StackSlot, Original);
+  } else
     ++NumReloads;
   return true;
 }
@@ -1203,6 +850,7 @@ void InlineSpiller::insertSpill(unsigned
   DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
                                            "spill"));
   ++NumSpills;
+  HSpiller.addToMergeableSpills(std::next(MI), StackSlot, Original);
 }
 
 /// spillAroundUses - insert spill code around each use of Reg.
@@ -1266,8 +914,7 @@ void InlineSpiller::spillAroundUses(unsi
         continue;
       }
       if (RI.Writes) {
-        // Hoist the spill of a sib-reg copy.
-        if (hoistSpill(OldLI, *MI)) {
+        if (hoistSpillInsideBB(OldLI, *MI)) {
           // This COPY is now dead, the value is already in the stack slot.
           MI->getOperand(0).setIsDead();
           DeadDefs.push_back(MI);
@@ -1380,7 +1027,6 @@ void InlineSpiller::spill(LiveRangeEdit
   assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
 
   collectRegsToSpill();
-  analyzeSiblingValues();
   reMaterializeAll();
 
   // Remat may handle everything.
@@ -1389,3 +1035,393 @@ void InlineSpiller::spill(LiveRangeEdit
 
   Edit->calculateRegClassAndHint(MF, Loops, MBFI);
 }
+
+/// Optimizations after all the reg selections and spills are done.
+///
+void InlineSpiller::postOptimization() {
+  SmallVector<unsigned, 4> NewVRegs;
+  LiveRangeEdit LRE(nullptr, NewVRegs, MF, LIS, &VRM, nullptr);
+  HSpiller.hoistAllSpills(LRE);
+  assert(NewVRegs.size() == 0 &&
+         "No new vregs should be generated in hoistAllSpills");
+}
+
+/// When a spill is inserted, add the spill to MergeableSpills map.
+///
+void HoistSpillHelper::addToMergeableSpills(MachineInstr *Spill, int StackSlot,
+                                            unsigned Original) {
+  StackSlotToReg[StackSlot] = Original;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  MergeableSpills[MIdx].insert(Spill);
+}
+
+/// When a spill is removed, remove the spill from MergeableSpills map.
+/// Return true if the spill is removed successfully.
+///
+bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr *Spill,
+                                             int StackSlot) {
+  int Original = StackSlotToReg[StackSlot];
+  if (!Original)
+    return false;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  return MergeableSpills[MIdx].erase(Spill);
+}
+
+/// Check BB to see if it is a possible target BB to place a hoisted spill,
+/// i.e., there should be a living sibling of OrigReg at the insert point.
+///
+bool HoistSpillHelper::isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI,
+                                     MachineBasicBlock &BB, unsigned &LiveReg) {
+  SlotIndex Idx;
+  MachineBasicBlock::iterator MI = BB.getFirstTerminator();
+  if (MI != BB.end())
+    Idx = LIS.getInstructionIndex(*MI);
+  else
+    Idx = LIS.getMBBEndIdx(&BB).getPrevSlot();
+  SmallSetVector<unsigned, 16> &Siblings = Virt2SiblingsMap[OrigReg];
+  assert((LIS.getInterval(OrigReg)).getVNInfoAt(Idx) == &OrigVNI &&
+         "Unexpected VNI");
+
+  for (auto const SibReg : Siblings) {
+    LiveInterval &LI = LIS.getInterval(SibReg);
+    VNInfo *VNI = LI.getVNInfoAt(Idx);
+    if (VNI) {
+      LiveReg = SibReg;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Remove redundant spills in the same BB. Save those redundant spills in
+/// SpillsToRm, and save the spill to keep and its BB in SpillBBToSpill map.
+///
+void HoistSpillHelper::rmRedundantSpills(
+    SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // For each spill seen, check SpillBBToSpill[] and see if its BB already has
+  // another spill inside. If a BB contains more than one spill, only keep the
+  // earlier spill with smaller SlotIndex.
+  for (const auto CurrentSpill : Spills) {
+    MachineBasicBlock *Block = CurrentSpill->getParent();
+    MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+    MachineInstr *PrevSpill = SpillBBToSpill[Node];
+    if (PrevSpill) {
+      SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+      SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+      MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+      MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+      SpillsToRm.push_back(SpillToRm);
+      SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+    } else {
+      SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+    }
+  }
+  for (const auto SpillToRm : SpillsToRm)
+    Spills.erase(SpillToRm);
+}
+
+/// Starting from \p Root find a top-down traversal order of the dominator
+/// tree to visit all basic blocks containing the elements of \p Spills.
+/// Redundant spills will be found and put into \p SpillsToRm at the same
+/// time. \p SpillBBToSpill will be populated as part of the process and
+/// maps a basic block to the first store occurring in the basic block.
+/// \post SpillsToRm.union(Spills at post) == Spills at pre
+///
+void HoistSpillHelper::getVisitOrders(
+    MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineDomTreeNode *> &Orders,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+    DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // The set contains all the possible BB nodes to which we may hoist
+  // original spills.
+  SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+  // Save the BB nodes on the path from the first BB node containing
+  // a non-redundant spill to the Root node.
+  SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+  // All the spills to be hoisted must originate from a single def instruction
+  // to the OrigReg. It means the def instruction should dominate all the spills
+  // to be hoisted. We choose the BB where the def instruction is located as
+  // the Root.
+  MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+  // For every node on the dominator tree with a spill, walk up the dominator
+  // tree towards the Root node until it is reached. If there is another node
+  // containing a spill in the middle of the path, the previously seen spill
+  // will be redundant and the node containing it will be removed. All the nodes
+  // on the path from the first node with a non-redundant spill to the Root
+  // node will be added to the WorkSet, which will contain all the possible
+  // locations where spills may be hoisted to after the loop below is done.
+  for (const auto Spill : Spills) {
+    MachineBasicBlock *Block = Spill->getParent();
+    MachineDomTreeNode *Node = MDT[Block];
+    MachineInstr *SpillToRm = nullptr;
+    while (Node != RootIDomNode) {
+      // If Node dominates Block, and it already contains a spill, the spill in
+      // Block will be redundant.
+      if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+        SpillToRm = SpillBBToSpill[MDT[Block]];
+        break;
+        /// If we see the Node already in WorkSet, the path from the Node to
+        /// the Root node must already be traversed by another spill.
+        /// Then no need to repeat.
+      } else if (WorkSet.count(Node)) {
+        break;
+      } else {
+        NodesOnPath.insert(Node);
+      }
+      Node = Node->getIDom();
+    }
+    if (SpillToRm) {
+      SpillsToRm.push_back(SpillToRm);
+    } else {
+      // Add a BB containing the original spills to SpillsToKeep -- i.e.,
+      // set the initial status before hoisting starts. The value of BBs
+      // containing original spills is set to 0, in order to distinguish
+      // them from BBs containing hoisted spills, which will be inserted
+      // into SpillsToKeep later during hoisting.
+      SpillsToKeep[MDT[Block]] = 0;
+      WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+    }
+    NodesOnPath.clear();
+  }
+
+  // Sort the nodes in WorkSet in top-down order and save the nodes
+  // in Orders. Orders will be used for hoisting in runHoistSpills.
+  unsigned idx = 0;
+  Orders.push_back(MDT.DT->getNode(Root));
+  do {
+    MachineDomTreeNode *Node = Orders[idx++];
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      if (WorkSet.count(Child))
+        Orders.push_back(Child);
+    }
+  } while (idx != Orders.size());
+  assert(Orders.size() == WorkSet.size() &&
+         "Orders have different size with WorkSet");
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+  SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+  for (; RIt != Orders.rend(); RIt++)
+    DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+  DEBUG(dbgs() << "\n");
+#endif
+}
+
+/// Try to hoist the spills in \p Spills, which all store the value \p OrigVNI
+/// of \p OrigReg, according to BB hotness. The spills to be removed are saved
+/// in \p SpillsToRm. The spills to be inserted are saved in \p SpillsToIns,
+/// which maps the target BB to the vreg to be stored there.
+///
+void HoistSpillHelper::runHoistSpills(
+    unsigned OrigReg, VNInfo &OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+  // Visit order of dominator tree nodes.
+  SmallVector<MachineDomTreeNode *, 32> Orders;
+  // SpillsToKeep contains all the nodes where spills are to be inserted
+  // during hoisting. If the spill to be inserted is an original spill
+  // (not a hoisted one), the value of the map entry is 0. If the spill
+  // is a hoisted spill, the value of the map entry is the VReg to be used
+  // as the source of the spill.
+  DenseMap<MachineDomTreeNode *, unsigned> SpillsToKeep;
+  // Map from BB to the first spill inside of it.
+  DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+
+  rmRedundantSpills(Spills, SpillsToRm, SpillBBToSpill);
+
+  MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI.def);
+  getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKeep,
+                 SpillBBToSpill);
+
+  // SpillsInSubTreeMap maps a dom tree node to a pair: the set of nodes in its
+  // subtree where spills are to be inserted, and the total cost of the spills
+  // inside those nodes.
+  typedef std::pair<SmallPtrSet<MachineDomTreeNode *, 16>, BlockFrequency>
+      NodesCostPair;
+  DenseMap<MachineDomTreeNode *, NodesCostPair> SpillsInSubTreeMap;
+  // Iterate Orders set in reverse order, which will be a bottom-up order
+  // in the dominator tree. Once we visit a dom tree node, we know its
+  // children have already been visited and the spill locations in the
+  // subtrees of all the children have been determined.
+  SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+  for (; RIt != Orders.rend(); RIt++) {
+    MachineBasicBlock *Block = (*RIt)->getBlock();
+
+    // If Block contains an original spill, simply continue.
+    if (SpillsToKeep.find(*RIt) != SpillsToKeep.end() && !SpillsToKeep[*RIt]) {
+      SpillsInSubTreeMap[*RIt].first.insert(*RIt);
+      // SpillsInSubTreeMap[*RIt].second contains the cost of spill.
+      SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block);
+      continue;
+    }
+
+    // Merge the children's spill sets into *RIt's entry. Bind that entry first:
+    // operator[] may grow the DenseMap and invalidate child iterators.
+    NodesCostPair &SubTree = SpillsInSubTreeMap[*RIt];
+    for (MachineDomTreeNode *Child : (*RIt)->getChildren()) {
+      auto ChildIt = SpillsInSubTreeMap.find(Child);
+      if (ChildIt == SpillsInSubTreeMap.end())
+        continue;
+      SubTree.first.insert(ChildIt->second.first.begin(),
+                           ChildIt->second.first.end());
+      SubTree.second += ChildIt->second.second;
+      SpillsInSubTreeMap.erase(ChildIt);
+    }
+
+    // No spills in subtree, simply continue.
+    if (SpillsInSubTreeMap[*RIt].first.empty())
+      continue;
+
+    // Check whether Block is a possible candidate to insert spill.
+    unsigned LiveReg = 0;
+    if (!isSpillCandBB(OrigReg, OrigVNI, *Block, LiveReg))
+      continue;
+
+    // If there are multiple spills that could be merged, bias a little
+    // to hoist the spill.
+    BranchProbability MarginProb = (SpillsInSubTreeMap[*RIt].first.size() > 1)
+                                       ? BranchProbability(9, 10)
+                                       : BranchProbability(1, 1);
+    if (SpillsInSubTreeMap[*RIt].second >
+        MBFI.getBlockFreq(Block) * MarginProb) {
+      // Hoist: Move spills to current Block.
+      for (const auto SpillBB : SpillsInSubTreeMap[*RIt].first) {
+        // When SpillBB is a BB contains original spill, insert the spill
+        // to SpillsToRm.
+        if (SpillsToKeep.find(SpillBB) != SpillsToKeep.end() &&
+            !SpillsToKeep[SpillBB]) {
+          MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+          SpillsToRm.push_back(SpillToRm);
+        }
+        // SpillBB will not contain spill anymore, remove it from SpillsToKeep.
+        SpillsToKeep.erase(SpillBB);
+      }
+      // Current Block is the BB containing the new hoisted spill. Add it to
+      // SpillsToKeep. LiveReg is the source of the new spill.
+      SpillsToKeep[*RIt] = LiveReg;
+      DEBUG({
+        dbgs() << "spills in BB: ";
+        for (const auto Rspill : SpillsInSubTreeMap[*RIt].first)
+          dbgs() << Rspill->getBlock()->getNumber() << " ";
+        dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+               << "\n";
+      });
+      SpillsInSubTreeMap[*RIt].first.clear();
+      SpillsInSubTreeMap[*RIt].first.insert(*RIt);
+      SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block);
+    }
+  }
+  // For spills in SpillsToKeep with LiveReg set (i.e., not original spill),
+  // save them to SpillsToIns.
+  for (const auto Ent : SpillsToKeep) {
+    if (Ent.second)
+      SpillsToIns[Ent.first->getBlock()] = Ent.second;
+  }
+}
+
+/// For spills with equal values, remove redundant spills and hoist the rest
+/// to less hot spots.
+///
+/// Spills with equal values will be collected into the same set in
+/// MergeableSpills when a spill is inserted. These equal spills originate
+/// from the same define instruction and are dominated by the instruction.
+/// Before hoisting all the equal spills, redundant spills inside the same
+/// BB are first marked to be deleted. Then starting from spills left, walk up
+/// on the dominator tree towards the Root node where the define instruction
+/// is located, mark the dominated spills to be deleted along the way and
+/// collect the BB nodes on the path from non-dominated spills to the define
+/// instruction into a WorkSet. The nodes in WorkSet are the candidate places
+/// where we consider to hoist the spills. We iterate the WorkSet in bottom-up
+/// order, and for each node, we will decide whether to hoist spills inside
+/// its subtree to that node. In this way, we can get benefit locally even if
+/// hoisting all the equal spills to one cold place is impossible.
+///
+void HoistSpillHelper::hoistAllSpills(LiveRangeEdit &Edit) {
+  // Save the mapping between stackslot and its original reg.
+  DenseMap<int, unsigned> SlotToOrigReg;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    int Slot = VRM.getStackSlot(Reg);
+    if (Slot != VirtRegMap::NO_STACK_SLOT)
+      SlotToOrigReg[Slot] = VRM.getOriginal(Reg);
+    unsigned Original = VRM.getPreSplitReg(Reg);
+    if (!MRI.def_empty(Reg))
+      Virt2SiblingsMap[Original].insert(Reg);
+  }
+
+  // Each entry in MergeableSpills contains a spill set with equal values.
+  for (auto &Ent : MergeableSpills) {
+    int Slot = Ent.first.first;
+    unsigned OrigReg = SlotToOrigReg[Slot];
+    VNInfo *OrigVNI = Ent.first.second;
+    SmallPtrSet<MachineInstr *, 16> &EqValSpills = Ent.second;
+    if (Ent.second.empty())
+      continue;
+
+    DEBUG({
+      dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+             << "Equal spills in BB: ";
+      for (const auto spill : EqValSpills)
+        dbgs() << spill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // SpillsToRm is the spill set to be removed from EqValSpills.
+    SmallVector<MachineInstr *, 16> SpillsToRm;
+    // SpillsToIns is the spill set to be newly inserted after hoisting.
+    DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+    runHoistSpills(OrigReg, *OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+    DEBUG({
+      dbgs() << "Finally inserted spills in BB: ";
+      for (const auto Ispill : SpillsToIns)
+        dbgs() << Ispill.first->getNumber() << " ";
+      dbgs() << "\nFinally removed spills in BB: ";
+      for (const auto Rspill : SpillsToRm)
+        dbgs() << Rspill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // Stack live range update.
+    LiveInterval &StackIntvl = LSS.getInterval(Slot);
+    if (!SpillsToIns.empty() || !SpillsToRm.empty()) {
+      LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+      StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+                                     StackIntvl.getValNumInfo(0));
+    }
+
+    // Insert hoisted spills.
+    for (auto const Insert : SpillsToIns) {
+      MachineBasicBlock *BB = Insert.first;
+      unsigned LiveReg = Insert.second;
+      MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+      TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+                              MRI.getRegClass(LiveReg), &TRI);
+      LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+      ++NumSpills;
+    }
+
+    // Remove redundant spills or change them to dead instructions.
+    NumSpills -= SpillsToRm.size();
+    for (auto const RMEnt : SpillsToRm) {
+      RMEnt->setDesc(TII.get(TargetOpcode::KILL));
+      for (unsigned i = RMEnt->getNumOperands(); i; --i) {
+        MachineOperand &MO = RMEnt->getOperand(i - 1);
+        if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+          RMEnt->RemoveOperand(i - 1);
+      }
+    }
+    Edit.eliminateDeadDefs(SpillsToRm, None, true);
+  }
+}

Modified: llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp (original)
+++ llvm/trunk/lib/CodeGen/LiveRangeEdit.cpp Wed Apr  6 10:41:07 2016
@@ -63,10 +63,13 @@ void LiveRangeEdit::scanRemattable(Alias
   for (VNInfo *VNI : getParent().valnos) {
     if (VNI->isUnused())
       continue;
-    MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+    unsigned Original = VRM->getOriginal(getReg());
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
     if (!DefMI)
       continue;
-    checkRematerializable(VNI, DefMI, aa);
+    checkRematerializable(OrigVNI, DefMI, aa);
   }
   ScannedRemattable = true;
 }
@@ -113,24 +116,18 @@ bool LiveRangeEdit::allUsesAvailableAt(c
   return true;
 }
 
-bool LiveRangeEdit::canRematerializeAt(Remat &RM,
-                                       SlotIndex UseIdx,
-                                       bool cheapAsAMove) {
+bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI,
+                                       SlotIndex UseIdx, bool cheapAsAMove) {
   assert(ScannedRemattable && "Call anyRematerializable first");
 
   // Use scanRemattable info.
-  if (!Remattable.count(RM.ParentVNI))
+  if (!Remattable.count(OrigVNI))
     return false;
 
   // No defining instruction provided.
   SlotIndex DefIdx;
-  if (RM.OrigMI)
-    DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
-  else {
-    DefIdx = RM.ParentVNI->def;
-    RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
-    assert(RM.OrigMI && "No defining instruction for remattable value");
-  }
+  assert(RM.OrigMI && "No defining instruction for remattable value");
+  DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
 
   // If only cheap remats were requested, bail out early.
   if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI))
@@ -261,6 +258,15 @@ void LiveRangeEdit::eliminateDeadDef(Mac
   // Collect virtual registers to be erased after MI is gone.
   SmallVector<unsigned, 8> RegsToErase;
   bool ReadsPhysRegs = false;
+  bool isOrigDef = false;
+  unsigned Dest;
+  if (VRM && MI->getOperand(0).isReg()) {
+    Dest = MI->getOperand(0).getReg();
+    unsigned Original = VRM->getOriginal(Dest);
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+    isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+  }
 
   // Check for live intervals that may shrink
   for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
@@ -314,11 +320,24 @@ void LiveRangeEdit::eliminateDeadDef(Mac
     }
     DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
   } else {
-    if (TheDelegate)
-      TheDelegate->LRE_WillEraseInstruction(MI);
-    LIS.RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
-    ++NumDCEDeleted;
+    // If the dest of MI is an original reg, don't delete the inst. Replace
+    // the dest with a new reg, keep the inst for remat of other siblings.
+    // The inst is saved in LiveRangeEdit::DeadRemats and will be deleted
+    // after all the allocations of the func are done.
+    if (isOrigDef) {
+      unsigned NewDest = createFrom(Dest);
+      pop_back();
+      markDeadRemat(MI);
+      const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+      MI->substituteRegister(Dest, NewDest, 0, TRI);
+      MI->getOperand(0).setIsDead(false);
+    } else {
+      if (TheDelegate)
+        TheDelegate->LRE_WillEraseInstruction(MI);
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      ++NumDCEDeleted;
+    }
   }
 
   // Erase any virtregs that are now empty and unused. There may be <undef>
@@ -332,8 +351,9 @@ void LiveRangeEdit::eliminateDeadDef(Mac
   }
 }
 
-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                                      ArrayRef<unsigned> RegsBeingSpilled) {
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled,
+                                      bool NoSplit) {
   ToShrinkSet ToShrink;
 
   for (;;) {
@@ -355,6 +375,9 @@ void LiveRangeEdit::eliminateDeadDefs(Sm
     if (!LIS.shrinkToUses(LI, &Dead))
       continue;
 
+    if (NoSplit)
+      continue;
+
     // Don't create new intervals for a register being spilled.
     // The new intervals would have to be spilled anyway so its not worth it.
     // Also they currently aren't spilled so creating them and not spilling

Modified: llvm/trunk/lib/CodeGen/RegAllocBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegAllocBase.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegAllocBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/RegAllocBase.cpp Wed Apr  6 10:41:07 2016
@@ -153,3 +153,12 @@ void RegAllocBase::allocatePhysRegs() {
     }
   }
 }
+
+void RegAllocBase::postOptimization() {
+  spiller().postOptimization();
+  for (auto DeadInst : DeadRemats) {
+    LIS->RemoveMachineInstrFromMaps(*DeadInst);
+    DeadInst->eraseFromParent();
+  }
+  DeadRemats.clear();
+}

Modified: llvm/trunk/lib/CodeGen/RegAllocBase.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegAllocBase.h?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegAllocBase.h (original)
+++ llvm/trunk/lib/CodeGen/RegAllocBase.h Wed Apr  6 10:41:07 2016
@@ -65,6 +65,12 @@ protected:
   LiveRegMatrix *Matrix;
   RegisterClassInfo RegClassInfo;
 
+  /// Inst which is a def of an original reg and whose defs are already all
+  /// dead after remat is saved in DeadRemats. The deletion of such inst is
+  /// postponed till all the allocations are done, so its remat expr is
+  /// always available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   RegAllocBase()
     : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}
 
@@ -77,6 +83,10 @@ protected:
   // physical register assignments.
   void allocatePhysRegs();
 
+  // Include spiller post optimization and removing dead defs left because of
+  // rematerialization.
+  virtual void postOptimization();
+
   // Get a temporary reference to a Spiller instance.
   virtual Spiller &spiller() = 0;
 

Modified: llvm/trunk/lib/CodeGen/RegAllocBasic.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegAllocBasic.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegAllocBasic.cpp (original)
+++ llvm/trunk/lib/CodeGen/RegAllocBasic.cpp Wed Apr  6 10:41:07 2016
@@ -199,7 +199,7 @@ bool RABasic::spillInterferences(LiveInt
     Matrix->unassign(Spill);
 
     // Spill the extracted interval.
-    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
     spiller().spill(LRE);
   }
   return true;
@@ -258,7 +258,7 @@ unsigned RABasic::selectOrSplit(LiveInte
   DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
   if (!VirtReg.isSpillable())
     return ~0u;
-  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
   spiller().spill(LRE);
 
   // The live virtual register requesting allocation was spilled, so tell
@@ -283,6 +283,7 @@ bool RABasic::runOnMachineFunction(Machi
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
 
   allocatePhysRegs();
+  postOptimization();
 
   // Diagnostic output before rewriting
   DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");

Modified: llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp (original)
+++ llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp Wed Apr  6 10:41:07 2016
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
 #include "LiveDebugVariables.h"
@@ -33,6 +32,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -44,6 +44,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <queue>
 
@@ -55,14 +56,14 @@ STATISTIC(NumGlobalSplits, "Number of sp
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");
 
-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
-  cl::desc("Spill mode for splitting live ranges"),
-  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
-             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
-             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
-             clEnumValEnd),
-  cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+    "split-spill-mode", cl::Hidden,
+    cl::desc("Spill mode for splitting live ranges"),
+    cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+               clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+               clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+               clEnumValEnd),
+    cl::init(SplitEditor::SM_Speed));
 
 static cl::opt<unsigned>
 LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@@ -1465,7 +1466,7 @@ unsigned RAGreedy::doRegionSplit(LiveInt
                                  SmallVectorImpl<unsigned> &NewVRegs) {
   SmallVector<unsigned, 8> UsedCands;
   // Prepare split editor.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
   SE->reset(LREdit, SplitSpillMode);
 
   // Assign all edge bundles to the preferred candidate, or NoCand.
@@ -1513,7 +1514,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInt
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
   SE->reset(LREdit, SplitSpillMode);
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1585,7 +1586,7 @@ RAGreedy::tryInstructionSplit(LiveInterv
 
   // Always enable split spill mode, since we're effectively spilling to a
   // register.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
   SE->reset(LREdit, SplitEditor::SM_Size);
 
   ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@@ -1908,7 +1909,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInt
                << '-' << Uses[BestAfter] << ", " << BestDiff
                << ", " << (BestAfter - BestBefore + 1) << " instrs\n");
 
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
   SE->reset(LREdit);
 
   SE->openIntv();
@@ -2551,7 +2552,7 @@ unsigned RAGreedy::selectOrSplitImpl(Liv
     NewVRegs.push_back(VirtReg.reg);
   } else {
     NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
-    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
     spiller().spill(LRE);
     setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
 
@@ -2609,6 +2610,8 @@ bool RAGreedy::runOnMachineFunction(Mach
 
   allocatePhysRegs();
   tryHintsRecoloring();
+  postOptimization();
+
   releaseMemory();
   return true;
 }

Modified: llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp (original)
+++ llvm/trunk/lib/CodeGen/RegAllocPBQP.cpp Wed Apr  6 10:41:07 2016
@@ -123,6 +123,12 @@ private:
 
   RegSet VRegsToAlloc, EmptyIntervalVRegs;
 
+  /// Inst which is a def of an original reg and whose defs are already all
+  /// dead after remat is saved in DeadRemats. The deletion of such inst is
+  /// postponed till all the allocations are done, so its remat expr is
+  /// always available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   /// \brief Finds the initial set of vreg intervals to allocate.
   void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
 
@@ -146,6 +152,7 @@ private:
   void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
                      VirtRegMap &VRM) const;
 
+  void postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS);
 };
 
 char RegAllocPBQP::ID = 0;
@@ -631,7 +638,8 @@ void RegAllocPBQP::spillVReg(unsigned VR
                              VirtRegMap &VRM, Spiller &VRegSpiller) {
 
   VRegsToAlloc.erase(VReg);
-  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
+                    nullptr, &DeadRemats);
   VRegSpiller.spill(LRE);
 
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -713,6 +721,16 @@ void RegAllocPBQP::finalizeAlloc(Machine
   }
 }
 
+void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
+  VRegSpiller.postOptimization();
+  /// Remove dead defs because of rematerialization.
+  for (auto DeadInst : DeadRemats) {
+    LIS.RemoveMachineInstrFromMaps(*DeadInst);
+    DeadInst->eraseFromParent();
+  }
+  DeadRemats.clear();
+}
+
 static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
                                          unsigned NumInstr) {
   // All intervals have a spill weight that is mostly proportional to the number
@@ -798,6 +816,7 @@ bool RegAllocPBQP::runOnMachineFunction(
 
   // Finalise allocation, allocate empty ranges.
   finalizeAlloc(MF, LIS, VRM);
+  postOptimization(*VRegSpiller, LIS);
   VRegsToAlloc.clear();
   EmptyIntervalVRegs.clear();
 

Modified: llvm/trunk/lib/CodeGen/Spiller.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/Spiller.h?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/Spiller.h (original)
+++ llvm/trunk/lib/CodeGen/Spiller.h Wed Apr  6 10:41:07 2016
@@ -16,6 +16,7 @@ namespace llvm {
   class MachineFunction;
   class MachineFunctionPass;
   class VirtRegMap;
+  class LiveIntervals;
 
   /// Spiller interface.
   ///
@@ -28,7 +29,7 @@ namespace llvm {
 
     /// spill - Spill the LRE.getParent() live interval.
     virtual void spill(LiveRangeEdit &LRE) = 0;
-
+    virtual void postOptimization(){};
   };
 
   /// Create and return a spiller that will insert spill code directly instead
@@ -36,7 +37,6 @@ namespace llvm {
   Spiller *createInlineSpiller(MachineFunctionPass &pass,
                                MachineFunction &mf,
                                VirtRegMap &vrm);
-
 }
 
 #endif

Modified: llvm/trunk/lib/CodeGen/SplitKit.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SplitKit.cpp?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SplitKit.cpp (original)
+++ llvm/trunk/lib/CodeGen/SplitKit.cpp Wed Apr  6 10:41:07 2016
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -430,8 +431,13 @@ VNInfo *SplitEditor::defFromParent(unsig
   bool Late = RegIdx != 0;
 
   // Attempt cheap-as-a-copy rematerialization.
+  unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
   LiveRangeEdit::Remat RM(ParentVNI);
-  if (Edit->canRematerializeAt(RM, UseIdx, true)) {
+  RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+  if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
     Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
     ++NumRemats;
   } else {
@@ -716,7 +722,62 @@ SplitEditor::findShallowDominator(Machin
   }
 }
 
-void SplitEditor::hoistCopiesForSize() {
+void SplitEditor::computeRedundantBackCopies(
+    DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+  LiveInterval *Parent = &Edit->getParent();
+  SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
+  SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+  // Aggregate VNIs having the same value as ParentVNI.
+  for (VNInfo *VNI : LI->valnos) {
+    if (VNI->isUnused())
+      continue;
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    EqualVNs[ParentVNI->id].insert(VNI);
+  }
+
+  // For VNI aggregation of each ParentVNI, collect dominated, i.e.,
+  // redundant VNIs to BackCopies.
+  for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+    VNInfo *ParentVNI = Parent->getValNumInfo(i);
+    if (!NotToHoistSet.count(ParentVNI->id))
+      continue;
+    SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
+    SmallPtrSetIterator<VNInfo *> It2 = It1;
+    for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
+      It2 = It1;
+      for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
+        if (DominatedVNIs.count(*It1) || DominatedVNIs.count(*It2))
+          continue;
+
+        MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
+        MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
+        if (MBB1 == MBB2) {
+          DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
+        } else if (MDT.dominates(MBB1, MBB2)) {
+          DominatedVNIs.insert(*It2);
+        } else if (MDT.dominates(MBB2, MBB1)) {
+          DominatedVNIs.insert(*It1);
+        }
+      }
+    }
+    if (!DominatedVNIs.empty()) {
+      forceRecompute(0, ParentVNI);
+      for (auto VNI : DominatedVNIs) {
+        BackCopies.push_back(VNI);
+      }
+      DominatedVNIs.clear();
+    }
+  }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies for
+/// the same ParentVNI and hoist the backcopies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not beneficial
+/// to do the hoisting, simply remove the dominated backcopies for the same
+/// ParentVNI.
+void SplitEditor::hoistCopies() {
   // Get the complement interval, always RegIdx 0.
   LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   LiveInterval *Parent = &Edit->getParent();
@@ -725,6 +786,11 @@ void SplitEditor::hoistCopiesForSize() {
   // indexed by ParentVNI->id.
   typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
   SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+  // The total cost of all the back-copies for each ParentVNI.
+  SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+  // The set of ParentVNI->id values for which hoisting back-copies is not
+  // beneficial in SM_Speed mode.
+  DenseSet<unsigned> NotToHoistSet;
 
   // Find the nearest common dominator for parent values with multiple
   // back-copies.  If a single back-copy dominates, put it in DomPair.second.
@@ -740,6 +806,7 @@ void SplitEditor::hoistCopiesForSize() {
       continue;
 
     MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+
     DomPair &Dom = NearestDom[ParentVNI->id];
 
     // Keep directly defined parent values.  This is either a PHI or an
@@ -774,6 +841,7 @@ void SplitEditor::hoistCopiesForSize() {
       else if (Near != Dom.first)
         // None dominate. Hoist to common dominator, need new def.
         Dom = DomPair(Near, SlotIndex());
+      Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
     }
 
     DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
@@ -792,6 +860,11 @@ void SplitEditor::hoistCopiesForSize() {
     MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
     // Get a less loopy dominator than Dom.first.
     Dom.first = findShallowDominator(Dom.first, DefMBB);
+    if (SpillMode == SM_Speed &&
+        MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
+      NotToHoistSet.insert(ParentVNI->id);
+      continue;
+    }
     SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
     Dom.second =
       defFromParent(0, ParentVNI, Last, *Dom.first,
@@ -806,11 +879,18 @@ void SplitEditor::hoistCopiesForSize() {
       continue;
     VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
     const DomPair &Dom = NearestDom[ParentVNI->id];
-    if (!Dom.first || Dom.second == VNI->def)
+    if (!Dom.first || Dom.second == VNI->def ||
+        NotToHoistSet.count(ParentVNI->id))
       continue;
     BackCopies.push_back(VNI);
     forceRecompute(0, ParentVNI);
   }
+
+  // If it is not beneficial to hoist all the BackCopies, simply remove
+  // redundant BackCopies in speed mode.
+  if (SpillMode == SM_Speed && !NotToHoistSet.empty())
+    computeRedundantBackCopies(NotToHoistSet, BackCopies);
+
   removeBackCopies(BackCopies);
 }
 
@@ -1004,6 +1084,8 @@ void SplitEditor::deleteRematVictims() {
       // Dead defs end at the dead slot.
       if (S.end != S.valno->def.getDeadSlot())
         continue;
+      if (S.valno->isPHIDef())
+        continue;
       MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
       assert(MI && "Missing instruction for dead def");
       MI->addRegisterDead(LI->reg, &TRI);
@@ -1048,10 +1130,9 @@ void SplitEditor::finish(SmallVectorImpl
     // Leave all back-copies as is.
     break;
   case SM_Size:
-    hoistCopiesForSize();
-    break;
   case SM_Speed:
-    llvm_unreachable("Spill mode 'speed' not implemented yet");
+    // hoistCopies will behave differently between size and speed.
+    hoistCopies();
   }
 
   // Transfer the simply mapped values, check if any are skipped.

Modified: llvm/trunk/lib/CodeGen/SplitKit.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SplitKit.h?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SplitKit.h (original)
+++ llvm/trunk/lib/CodeGen/SplitKit.h Wed Apr  6 10:41:07 2016
@@ -18,6 +18,7 @@
 #include "LiveRangeCalc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 
@@ -329,9 +330,14 @@ private:
   MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
                                           MachineBasicBlock *DefMBB);
 
-  /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
-  /// way that minimizes code size. This implements the SM_Size spill mode.
-  void hoistCopiesForSize();
+  /// Find out all the backCopies dominated by others.
+  void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
+                                  SmallVectorImpl<VNInfo *> &BackCopies);
+
+  /// Hoist back-copies to the complement interval. It tries to hoist all
+  /// the back-copies to one BB if it is beneficial, or else simply remove
+  /// redundant backcopies dominated by others.
+  void hoistCopies();
 
   /// transferValues - Transfer values to the new ranges.
   /// Return true if any ranges were skipped.

Removed: llvm/trunk/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/aarch64-deferred-spilling.ll?rev=265546&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-deferred-spilling.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-deferred-spilling.ll (removed)
@@ -1,514 +0,0 @@
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
-
-; Check that we do not end up with useless spill code.
-;
-; Move to the basic block we are interested in.
-;
-; CHECK: // %if.then.120
-;
-; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill
-; Check that w21 wouldn't need to be spilled since it is never reused.
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-;
-; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22,
-; DEFERRED: mov {{[wx]}}22,
-; DEFERRED-NOT: str {{[wx]}}22,
-;
-; CHECK:        bl      fprintf
-;
-; DEFERRED-NOT: ldr {{[wx]}}22,
-; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22
-; DEFERRED-NOT: ldr {{[wx]}}22,
-;
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload
-;
-; End of the basic block we are interested in.
-; CHECK:        b
-; CHECK: {{[^:]+}}: // %sw.bb.123
-
-%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-%struct.__sbuf = type { i8*, i64 }
-%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* }
-%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* }
-
- at __sF = external global [0 x %struct.__sFILE], align 8
- at .str = private unnamed_addr constant [20 x i8] c"\0A    [%d: stuff+mf \00", align 1
-
-declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...)
-
-declare void @bar(i32)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-
-define i32 @foo(%struct.DState* %s) {
-entry:
-  %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1
-  %tmp = load i32, i32* %state, align 4
-  %cmp = icmp eq i32 %tmp, 10
-  %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40
-  br i1 %cmp, label %if.end.thread, label %if.end
-
-if.end.thread:                                    ; preds = %entry
-  %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %save_zj = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %tmp1 = bitcast i32* %save_i to i8*
-  call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false)
-  br label %sw.default
-
-if.end:                                           ; preds = %entry
-  %.pre = load i32, i32* %save_i, align 4
-  %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4
-  %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4
-  %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4
-  %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4
-  %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4
-  %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4
-  %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4
-  %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4
-  %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4
-  %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4
-  %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4
-  %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4
-  %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4
-  %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4
-  %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4
-  %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4
-  %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4
-  %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4
-  switch i32 %tmp, label %sw.default [
-    i32 13, label %sw.bb
-    i32 14, label %if.end.sw.bb.65_crit_edge
-    i32 25, label %if.end.sw.bb.123_crit_edge
-  ]
-
-if.end.sw.bb.123_crit_edge:                       ; preds = %if.end
-  %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  br label %sw.bb.123
-
-if.end.sw.bb.65_crit_edge:                        ; preds = %if.end
-  %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4
-  br label %sw.bb.65
-
-sw.bb:                                            ; preds = %if.end
-  %sunkaddr = ptrtoint %struct.DState* %s to i64
-  %sunkaddr485 = add i64 %sunkaddr, 8
-  %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32*
-  store i32 13, i32* %sunkaddr486, align 4
-  %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %tmp2 = load i32, i32* %bsLive, align 4
-  %cmp28.400 = icmp sgt i32 %tmp2, 7
-  br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph
-
-sw.bb.if.then.29_crit_edge:                       ; preds = %sw.bb
-  %sunkaddr487 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr488 = add i64 %sunkaddr487, 32
-  %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32*
-  %.pre425 = load i32, i32* %sunkaddr489, align 4
-  br label %if.then.29
-
-if.end.33.lr.ph:                                  ; preds = %sw.bb
-  %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8
-  %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1
-  %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4
-  %tmp4 = add i32 %.pre430, -1
-  br label %if.end.33
-
-if.then.29:                                       ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge
-  %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ]
-  %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ]
-  %sub = add nsw i32 %.lcssa393, -8
-  %shr = lshr i32 %tmp5, %sub
-  %and = and i32 %shr, 255
-  %sunkaddr491 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr492 = add i64 %sunkaddr491, 36
-  %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32*
-  store i32 %sub, i32* %sunkaddr493, align 4
-  %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9
-  store i32 %and, i32* %blockSize100k, align 4
-  %and.off = add nsw i32 %and, -49
-  %tmp6 = icmp ugt i32 %and.off, 8
-  br i1 %tmp6, label %save_state_and_return, label %if.end.62
-
-if.end.33:                                        ; preds = %while.body.backedge, %if.end.33.lr.ph
-  %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ]
-  %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ]
-  %cmp35 = icmp eq i32 %lsr.iv482, -1
-  br i1 %cmp35, label %save_state_and_return, label %if.end.37
-
-if.end.37:                                        ; preds = %if.end.33
-  %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8**
-  %sunkaddr494 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr495 = add i64 %sunkaddr494, 32
-  %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32*
-  %tmp9 = load i32, i32* %sunkaddr496, align 4
-  %shl = shl i32 %tmp9, 8
-  %tmp10 = load i8*, i8** %tmp8, align 8
-  %tmp11 = load i8, i8* %tmp10, align 1
-  %conv = zext i8 %tmp11 to i32
-  %or = or i32 %conv, %shl
-  store i32 %or, i32* %sunkaddr496, align 4
-  %add = add nsw i32 %tmp7, 8
-  %sunkaddr497 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr498 = add i64 %sunkaddr497, 36
-  %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32*
-  store i32 %add, i32* %sunkaddr499, align 4
-  %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1
-  store i8* %incdec.ptr, i8** %tmp8, align 8
-  %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr501 = add i64 %sunkaddr500, 8
-  %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32*
-  store i32 %lsr.iv482, i32* %sunkaddr502, align 4
-  %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr504 = add i64 %sunkaddr503, 12
-  %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32*
-  %tmp12 = load i32, i32* %sunkaddr505, align 4
-  %inc = add i32 %tmp12, 1
-  store i32 %inc, i32* %sunkaddr505, align 4
-  %cmp49 = icmp eq i32 %inc, 0
-  br i1 %cmp49, label %if.then.51, label %while.body.backedge
-
-if.then.51:                                       ; preds = %if.end.37
-  %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr507 = add i64 %sunkaddr506, 16
-  %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32*
-  %tmp13 = load i32, i32* %sunkaddr508, align 4
-  %inc53 = add i32 %tmp13, 1
-  store i32 %inc53, i32* %sunkaddr508, align 4
-  br label %while.body.backedge
-
-while.body.backedge:                              ; preds = %if.then.51, %if.end.37
-  %lsr.iv.next483 = add i32 %lsr.iv482, -1
-  %cmp28 = icmp sgt i32 %add, 7
-  br i1 %cmp28, label %if.then.29, label %if.end.33
-
-if.end.62:                                        ; preds = %if.then.29
-  %sub64 = add nsw i32 %and, -48
-  %sunkaddr509 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr510 = add i64 %sunkaddr509, 40
-  %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32*
-  store i32 %sub64, i32* %sunkaddr511, align 4
-  br label %sw.bb.65
-
-sw.bb.65:                                         ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge
-  %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ]
-  %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ]
-  %sunkaddr512 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr513 = add i64 %sunkaddr512, 8
-  %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32*
-  store i32 14, i32* %sunkaddr514, align 4
-  %cmp70.397 = icmp sgt i32 %tmp14, 7
-  br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph
-
-if.end.82.lr.ph:                                  ; preds = %sw.bb.65
-  %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8
-  %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1
-  %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4
-  %tmp16 = add i32 %.pre431, -1
-  br label %if.end.82
-
-if.then.72:                                       ; preds = %while.body.68.backedge, %sw.bb.65
-  %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ]
-  %sub76 = add nsw i32 %.lcssa390, -8
-  %sunkaddr516 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr517 = add i64 %sunkaddr516, 36
-  %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32*
-  store i32 %sub76, i32* %sunkaddr518, align 4
-  %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11
-  %tmp17 = load i32, i32* %currBlockNo, align 4
-  %inc117 = add nsw i32 %tmp17, 1
-  store i32 %inc117, i32* %currBlockNo, align 4
-  %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12
-  %tmp18 = load i32, i32* %verbosity, align 4
-  %cmp118 = icmp sgt i32 %tmp18, 1
-  br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0
-
-if.end.82:                                        ; preds = %while.body.68.backedge, %if.end.82.lr.ph
-  %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ]
-  %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ]
-  %cmp85 = icmp eq i32 %lsr.iv480, -1
-  br i1 %cmp85, label %save_state_and_return, label %if.end.88
-
-if.end.88:                                        ; preds = %if.end.82
-  %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8**
-  %sunkaddr519 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr520 = add i64 %sunkaddr519, 32
-  %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32*
-  %tmp21 = load i32, i32* %sunkaddr521, align 4
-  %shl90 = shl i32 %tmp21, 8
-  %tmp22 = load i8*, i8** %tmp20, align 8
-  %tmp23 = load i8, i8* %tmp22, align 1
-  %conv93 = zext i8 %tmp23 to i32
-  %or94 = or i32 %conv93, %shl90
-  store i32 %or94, i32* %sunkaddr521, align 4
-  %add97 = add nsw i32 %tmp19, 8
-  %sunkaddr522 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr523 = add i64 %sunkaddr522, 36
-  %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32*
-  store i32 %add97, i32* %sunkaddr524, align 4
-  %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1
-  store i8* %incdec.ptr100, i8** %tmp20, align 8
-  %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr526 = add i64 %sunkaddr525, 8
-  %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32*
-  store i32 %lsr.iv480, i32* %sunkaddr527, align 4
-  %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr529 = add i64 %sunkaddr528, 12
-  %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32*
-  %tmp24 = load i32, i32* %sunkaddr530, align 4
-  %inc106 = add i32 %tmp24, 1
-  store i32 %inc106, i32* %sunkaddr530, align 4
-  %cmp109 = icmp eq i32 %inc106, 0
-  br i1 %cmp109, label %if.then.111, label %while.body.68.backedge
-
-if.then.111:                                      ; preds = %if.end.88
-  %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr532 = add i64 %sunkaddr531, 16
-  %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32*
-  %tmp25 = load i32, i32* %sunkaddr533, align 4
-  %inc114 = add i32 %tmp25, 1
-  store i32 %inc114, i32* %sunkaddr533, align 4
-  br label %while.body.68.backedge
-
-while.body.68.backedge:                           ; preds = %if.then.111, %if.end.88
-  %lsr.iv.next481 = add i32 %lsr.iv480, -1
-  %cmp70 = icmp sgt i32 %add97, 7
-  br i1 %cmp70, label %if.then.72, label %if.end.82
-
-if.then.120:                                      ; preds = %if.then.72
-  %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117)
-  br label %sw.bb.123
-
-sw.bb.123:                                        ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge
-  %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ]
-  %sunkaddr534 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr535 = add i64 %sunkaddr534, 8
-  %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32*
-  store i32 25, i32* %sunkaddr536, align 4
-  %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4
-  %cmp128.395 = icmp sgt i32 %tmp26, 7
-  br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph
-
-sw.bb.123.if.then.130_crit_edge:                  ; preds = %sw.bb.123
-  %sunkaddr537 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr538 = add i64 %sunkaddr537, 32
-  %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32*
-  %.pre429 = load i32, i32* %sunkaddr539, align 4
-  br label %if.then.130
-
-if.end.140.lr.ph:                                 ; preds = %sw.bb.123
-  %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8
-  %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1
-  %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4
-  %tmp28 = add i32 %.pre432, -1
-  br label %if.end.140
-
-if.then.130:                                      ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge
-  %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ]
-  %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ]
-  %sub134 = add nsw i32 %.lcssa, -8
-  %shr135 = lshr i32 %tmp29, %sub134
-  store i32 %sub134, i32* %bsLive127.pre-phi, align 4
-  %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13
-  %tmp30 = load i32, i32* %origPtr, align 4
-  %shl175 = shl i32 %tmp30, 8
-  %conv176 = and i32 %shr135, 255
-  %or177 = or i32 %shl175, %conv176
-  store i32 %or177, i32* %origPtr, align 4
-  %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27
-  %tmp31 = load i32, i32* %nInUse, align 4
-  %add179 = add nsw i32 %tmp31, 2
-  br label %save_state_and_return
-
-if.end.140:                                       ; preds = %while.body.126.backedge, %if.end.140.lr.ph
-  %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ]
-  %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ]
-  %cmp143 = icmp eq i32 %lsr.iv, -1
-  br i1 %cmp143, label %save_state_and_return, label %if.end.146
-
-if.end.146:                                       ; preds = %if.end.140
-  %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8**
-  %sunkaddr541 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr542 = add i64 %sunkaddr541, 32
-  %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32*
-  %tmp34 = load i32, i32* %sunkaddr543, align 4
-  %shl148 = shl i32 %tmp34, 8
-  %tmp35 = load i8*, i8** %tmp33, align 8
-  %tmp36 = load i8, i8* %tmp35, align 1
-  %conv151 = zext i8 %tmp36 to i32
-  %or152 = or i32 %conv151, %shl148
-  store i32 %or152, i32* %sunkaddr543, align 4
-  %add155 = add nsw i32 %tmp32, 8
-  store i32 %add155, i32* %bsLive127.pre-phi, align 4
-  %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1
-  store i8* %incdec.ptr158, i8** %tmp33, align 8
-  %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr545 = add i64 %sunkaddr544, 8
-  %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32*
-  store i32 %lsr.iv, i32* %sunkaddr546, align 4
-  %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr548 = add i64 %sunkaddr547, 12
-  %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32*
-  %tmp37 = load i32, i32* %sunkaddr549, align 4
-  %inc164 = add i32 %tmp37, 1
-  store i32 %inc164, i32* %sunkaddr549, align 4
-  %cmp167 = icmp eq i32 %inc164, 0
-  br i1 %cmp167, label %if.then.169, label %while.body.126.backedge
-
-if.then.169:                                      ; preds = %if.end.146
-  %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr551 = add i64 %sunkaddr550, 16
-  %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32*
-  %tmp38 = load i32, i32* %sunkaddr552, align 4
-  %inc172 = add i32 %tmp38, 1
-  store i32 %inc172, i32* %sunkaddr552, align 4
-  br label %while.body.126.backedge
-
-while.body.126.backedge:                          ; preds = %if.then.169, %if.end.146
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %cmp128 = icmp sgt i32 %add155, 7
-  br i1 %cmp128, label %if.then.130, label %if.end.140
-
-sw.default:                                       ; preds = %if.end, %if.end.thread
-  %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ]
-  %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ]
-  %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ]
-  %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ]
-  %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ]
-  %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ]
-  %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ]
-  %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ]
-  %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ]
-  %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ]
-  %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ]
-  %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ]
-  %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ]
-  %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ]
-  %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ]
-  %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ]
-  %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ]
-  %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ]
-  %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ]
-  %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ]
-  %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ]
-  %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ]
-  %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ]
-  %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ]
-  %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ]
-  %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ]
-  %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ]
-  %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ]
-  %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ]
-  %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ]
-  %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ]
-  %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ]
-  %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ]
-  %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ]
-  %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ]
-  %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ]
-  %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ]
-  tail call void @bar(i32 4001)
-  br label %save_state_and_return
-
-save_state_and_return:                            ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29
-  %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ]
-  %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ]
-  %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ]
-  %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ]
-  %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ]
-  %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ]
-  %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ]
-  %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ]
-  %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ]
-  %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ]
-  %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ]
-  %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ]
-  %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ]
-  %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ]
-  %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ]
-  %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ]
-  %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ]
-  %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ]
-  %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ]
-  %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ]
-  %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ]
-  %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ]
-  %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ]
-  %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ]
-  %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ]
-  %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ]
-  %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ]
-  %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ]
-  %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ]
-  %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ]
-  %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ]
-  %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ]
-  %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ]
-  %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ]
-  %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ]
-  %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ]
-  %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ]
-  %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ]
-  store i32 %tmp58, i32* %save_i, align 4
-  store i32 %tmp59, i32* %save_j3.pre-phi468, align 4
-  store i32 %tmp60, i32* %save_t4.pre-phi466, align 4
-  store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4
-  store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4
-  store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4
-  store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4
-  store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4
-  store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4
-  store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4
-  store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4
-  store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4
-  store i32 %tmp68, i32* %save_es14.pre-phi446, align 4
-  store i32 %tmp69, i32* %save_N15.pre-phi444, align 4
-  store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4
-  store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4
-  store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4
-  store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4
-  store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4
-  ret i32 %retVal.0
-}
-
-!0 = !{!"branch_weights", i32 10, i32 1}

Modified: llvm/trunk/test/CodeGen/X86/fp128-compare.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp128-compare.ll?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fp128-compare.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fp128-compare.ll Wed Apr  6 10:41:07 2016
@@ -86,8 +86,8 @@ entry:
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
 ; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm1
 ; CHECK: movaps %xmm0
+; CHECK: movaps %xmm1
 ; CHECK: callq __gttf2
 ; CHECK: movaps {{.*}}, %xmm0
 ; CHECK: testl %eax, %eax

Added: llvm/trunk/test/CodeGen/X86/hoist-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/hoist-spill.ll?rev=265547&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/hoist-spill.ll (added)
+++ llvm/trunk/test/CodeGen/X86/hoist-spill.ll Wed Apr  6 10:41:07 2016
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+
+; grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check no spills to the same stack slot after hoisting.
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) {
+entry:
+  %tmp = load i32*, i32** @d, align 8
+  %tmp1 = load i32*, i32** @a, align 8
+  %tmp2 = sext i32 %p1 to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc14, %entry
+  %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+  %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+  %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+  %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+  %tmp3 = icmp sgt i32 undef, 0
+  %smax52 = select i1 %tmp3, i32 undef, i32 0
+  %tmp4 = zext i32 %smax52 to i64
+  %tmp5 = icmp sgt i64 undef, %tmp4
+  %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+  %tmp6 = add nsw i64 %smax53, 1
+  %tmp7 = sub nsw i64 %tmp6, %tmp4
+  %tmp8 = add nsw i64 %tmp7, -8
+  %tmp9 = sub i32 undef, %indvar
+  %tmp10 = icmp sgt i64 %tmp2, 0
+  %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+  %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+  %indvars.iv30 = add i32 %indvars.iv30.in, -1
+  %tmp11 = icmp sgt i32 %indvars.iv30, 0
+  %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+  %tmp12 = zext i32 %smax to i64
+  %sub = sub nsw i32 %p1, %c.0
+  %cmp = icmp sgt i32 %sub, 0
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  %cmp326 = icmp sgt i32 %k.0, %p1
+  br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.cond4.preheader:                              ; preds = %for.body, %for.cond
+  %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+  %cmp528 = icmp sgt i32 %sub., %p1
+  br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %for.body6.preheader
+  br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %bound1 = icmp ule i32* undef, %scevgep41
+  %memcheck.conflict = and i1 undef, %bound1
+  br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %lcmp.mod = icmp eq i64 undef, 0
+  br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol:                                 ; preds = %vector.body.prol, %vector.body.preheader
+  %prol.iter.cmp = icmp eq i64 undef, 0
+  br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split:                      ; preds = %vector.body.prol, %vector.body.preheader
+  %tmp13 = icmp ult i64 %tmp8, 24
+  br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.body.preheader.split
+  %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+  %index.next = add i64 %index, 8
+  %offset.idx.1 = add i64 %tmp12, %index.next
+  %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+  %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+  %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+  store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+  %index.next.3 = add i64 %index, 32
+  br i1 undef, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
+  br i1 undef, label %for.inc14, label %for.body6
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+  %add = add nsw i32 %k.127, 1
+  %tmp18 = load i32, i32* undef, align 4
+  store i32 %tmp18, i32* @b, align 4
+  br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+  %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+  %tmp19 = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+  store i32 %tmp19, i32* %arrayidx10, align 4
+  %cmp5 = icmp slt i64 %indvars.iv32, undef
+  br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6, %middle.block, %for.cond4.preheader
+  %inc15 = add nuw nsw i32 %c.0, 1
+  %indvar.next = add i32 %indvar, 1
+  br label %for.cond
+}

Added: llvm/trunk/test/CodeGen/X86/new-remat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/new-remat.ll?rev=265547&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/new-remat.ll (added)
+++ llvm/trunk/test/CodeGen/X86/new-remat.ll Wed Apr  6 10:41:07 2016
@@ -0,0 +1,70 @@
+; RUN: llc < %s | FileCheck %s
+; Check all spills are rematerialized.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) {
+entry:
+  %cmp3 = icmp sgt i32 %p1, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = add i32 %p1, -1
+  %xtraiter = and i32 %p1, 7
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader:                          ; preds = %for.body.preheader
+  br label %for.body.prol
+
+for.body.prol:                                    ; preds = %for.body.prol, %for.body.prol.preheader
+  %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+  %tmp1 = load double, double* @b, align 8
+  %call.prol = tail call double @pow(double %tmp1, double 2.500000e-01)
+  %inc.prol = add nuw nsw i32 %i.04.prol, 1
+  %prol.iter.sub = add i32 %prol.iter, -1
+  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+  br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit:                ; preds = %for.body.prol
+  %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+  br label %for.body.preheader.split
+
+for.body.preheader.split:                         ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+  %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+  %tmp2 = icmp ult i32 %tmp, 7
+  br i1 %tmp2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split:                   ; preds = %for.body.preheader.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.split.split
+  %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+  %tmp3 = load double, double* @b, align 8
+  %call = tail call double @pow(double %tmp3, double 2.500000e-01)
+  %tmp4 = load double, double* @b, align 8
+  %call.1 = tail call double @pow(double %tmp4, double 2.500000e-01)
+  %inc.7 = add nsw i32 %i.04, 8
+  %exitcond.7 = icmp eq i32 %inc.7, %p1
+  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa:                       ; preds = %for.body
+  br label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.end.loopexit.unr-lcssa, %for.body.preheader.split
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %tmp5 = load i32, i32* @a, align 4
+  ret i32 %tmp5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double)

Modified: llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll?rev=265547&r1=265546&r2=265547&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ragreedy-hoist-spill.ll Wed Apr  6 10:41:07 2016
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
 
 ; This testing case is reduced from 254.gap SyFgets function.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
 
 %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
 %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@@ -181,6 +181,10 @@ sw.bb474:
   br i1 %cmp476, label %if.end517, label %do.body479.preheader
 
 do.body479.preheader:
+  ; CHECK: do.body479.preheader
+  ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, do.body479.preheader is cold.
+  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; CHECK: land.rhs485
   %cmp4833314 = icmp eq i8 undef, 0
   br i1 %cmp4833314, label %if.end517, label %land.rhs485
 
@@ -200,8 +204,8 @@ land.lhs.true490:
 
 lor.rhs500:
   ; CHECK: lor.rhs500
-  ; Make sure that we don't hoist the spill to outer loops.
-  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; Make sure spill is hoisted to a cold preheader in outside loop.
+  ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
   ; CHECK: callq {{.*}}maskrune
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
   br i1 undef, label %land.lhs.true504, label %do.body479.backedge




More information about the llvm-commits mailing list