[llvm] r277480 - Rewrite the use optimizer to be less memory intensive and 50% faster.
Daniel Berlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 2 09:24:03 PDT 2016
Author: dannyb
Date: Tue Aug 2 11:24:03 2016
New Revision: 277480
URL: http://llvm.org/viewvc/llvm-project?rev=277480&view=rev
Log:
Rewrite the use optimizer to be less memory intensive and 50% faster.
Fixes PR28670
Summary:
Rewrite the use optimizer to be less memory intensive and 50% faster.
Fixes PR28670
The new use optimizer works like a standard SSA renaming pass: it keeps a
stack of every version (defining access) a MemorySSA use could resolve to,
and tracks only indexes into that stack.
This uses much less memory than caching the N^2 alias query results, and it
is also a lot faster.
The current version defers phi node walking to the normal walker.
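For intuition, here is a minimal sketch of the version-stack idea. This is a
simplified stand-alone model, not the code in this patch: Def, Use, and the
clobbers() placeholder are invented stand-ins, and the real pass additionally
tracks LastKill and stack/pop epochs to stay correct as the dominator-tree
walk pops blocks.

  #include <cassert>
  #include <vector>

  struct Def { int ID; };
  struct Use {
    int ID;
    unsigned long LowerBound = 0; // stack index where the last walk stopped
    Def *Clobber = nullptr;       // resolved defining access
  };

  // Placeholder alias check; in MemorySSA this role is played by
  // AA.getModRefInfo().
  static bool clobbers(const Def &D, const Use &U) {
    return D.ID % 3 == U.ID % 3; // arbitrary stand-in relation
  }

  // Resolve one use against the stack of defs reaching this program point.
  // Rather than caching every (def, use) alias query (N^2 memory), each use
  // remembers only LowerBound, so it re-queries just the defs pushed since
  // its last walk.
  static void resolve(std::vector<Def *> &VersionStack, Use &U) {
    assert(!VersionStack.empty());
    unsigned long Upper = VersionStack.size() - 1;
    while (Upper > U.LowerBound && !clobbers(*VersionStack[Upper], U))
      --Upper;
    U.Clobber = VersionStack[Upper];        // nearest clobber (or the bound)
    U.LowerBound = VersionStack.size() - 1; // everything above is checked
  }

In the real pass, defs and phis are pushed as blocks are entered and popped
when the dominator-tree walk leaves their subtree, exactly as in classic SSA
renaming.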
Reviewers: george.burgess.iv
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D23032
Modified:
llvm/trunk/include/llvm/Transforms/Utils/MemorySSA.h
llvm/trunk/lib/Transforms/Utils/MemorySSA.cpp
Modified: llvm/trunk/include/llvm/Transforms/Utils/MemorySSA.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Utils/MemorySSA.h?rev=277480&r1=277479&r2=277480&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/MemorySSA.h (original)
+++ llvm/trunk/include/llvm/Transforms/Utils/MemorySSA.h Tue Aug 2 11:24:03 2016
@@ -530,8 +530,7 @@ public:
///
/// This list is not modifiable by the user.
const AccessList *getBlockAccesses(const BasicBlock *BB) const {
- auto It = PerBlockAccesses.find(BB);
- return It == PerBlockAccesses.end() ? nullptr : It->second.get();
+ return getWritableBlockAccesses(BB);
}
/// \brief Create an empty MemoryPhi in MemorySSA for a given basic block.
@@ -602,11 +601,20 @@ protected:
void verifyDomination(Function &F) const;
void verifyOrdering(Function &F) const;
+ // This is used by the use optimizer class.
+ AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
+ auto It = PerBlockAccesses.find(BB);
+ return It == PerBlockAccesses.end() ? nullptr : It->second.get();
+ }
+
private:
class CachingWalker;
+ class OptimizeUses;
CachingWalker *getWalkerImpl();
void buildMemorySSA();
+ void optimizeUses();
+
void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const;
using AccessMap = DenseMap<const BasicBlock *, std::unique_ptr<AccessList>>;
Modified: llvm/trunk/lib/Transforms/Utils/MemorySSA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/MemorySSA.cpp?rev=277480&r1=277479&r2=277480&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/MemorySSA.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/MemorySSA.cpp Tue Aug 2 11:24:03 2016
@@ -61,6 +61,11 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrap
INITIALIZE_PASS_END(MemorySSAPrinterLegacyPass, "print-memoryssa",
"Memory SSA Printer", false, false)
+static cl::opt<unsigned> MaxCheckLimit(
+    "memssa-check-limit", cl::Hidden, cl::init(100),
+    cl::desc("The maximum number of stores/phis MemorySSA "
+             "will consider trying to walk past (default = 100)"));
+
static cl::opt<bool>
VerifyMemorySSA("verify-memoryssa", cl::init(false), cl::Hidden,
cl::desc("Verify MemorySSA in legacy printer pass."));
@@ -111,16 +116,17 @@ struct UpwardsMemoryQuery {
}
};
-static bool instructionClobbersQuery(MemoryDef *MD, const MemoryLocation &Loc,
- const UpwardsMemoryQuery &Query,
+static bool instructionClobbersQuery(MemoryDef *MD,
+                                     const MemoryLocation &UseLoc,
+                                     const Instruction *UseInst,
AliasAnalysis &AA) {
- Instruction *DefMemoryInst = MD->getMemoryInst();
- assert(DefMemoryInst && "Defining instruction not actually an instruction");
-
- if (!Query.IsCall)
- return AA.getModRefInfo(DefMemoryInst, Loc) & MRI_Mod;
+ Instruction *DefInst = MD->getMemoryInst();
+ assert(DefInst && "Defining instruction not actually an instruction");
+ ImmutableCallSite UseCS(UseInst);
+ if (!UseCS)
+ return AA.getModRefInfo(DefInst, UseLoc) & MRI_Mod;
- ModRefInfo I = AA.getModRefInfo(DefMemoryInst, ImmutableCallSite(Query.Inst));
+ ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
return I != MRI_NoModRef;
}
@@ -257,8 +263,9 @@ checkClobberSanity(MemoryAccess *Start,
//
// Also, note that this can't be hoisted out of the `Worklist` loop,
// since MD may only act as a clobber for 1 of N MemoryLocations.
- FoundClobber = FoundClobber || MSSA.isLiveOnEntryDef(MD) ||
- instructionClobbersQuery(MD, MAP.second, Query, AA);
+ FoundClobber =
+ FoundClobber || MSSA.isLiveOnEntryDef(MD) ||
+ instructionClobbersQuery(MD, MAP.second, Query.Inst, AA);
}
break;
}
@@ -268,7 +275,7 @@ checkClobberSanity(MemoryAccess *Start,
if (auto *MD = dyn_cast<MemoryDef>(MA)) {
(void)MD;
- assert(!instructionClobbersQuery(MD, MAP.second, Query, AA) &&
+ assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA) &&
"Found clobber before reaching ClobberAt!");
continue;
}
@@ -428,7 +435,7 @@ class ClobberWalker {
if (auto *MD = dyn_cast<MemoryDef>(Current))
if (MSSA.isLiveOnEntryDef(MD) ||
- instructionClobbersQuery(MD, Desc.Loc, *Query, AA))
+ instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
return {MD, true, false};
// Cache checks must be done last, because if Current is a clobber, the
@@ -1065,6 +1072,297 @@ MemorySSA::AccessList *MemorySSA::getOrC
return Res.first->second.get();
}
+/// Our current alias analysis API differentiates heavily between calls and
+/// non-calls, and functions called on one usually assert on the other.
+/// This class encapsulates the distinction to simplify other code that wants
+/// "Memory affecting instructions and related data" to use as a key.
+/// For example, this class is used as a DenseMap key in the use optimizer.
+class MemoryLocOrCall {
+public:
+ MemoryLocOrCall() : IsCall(false) {}
+ MemoryLocOrCall(MemoryUseOrDef *MUD)
+ : MemoryLocOrCall(MUD->getMemoryInst()) {}
+
+ MemoryLocOrCall(Instruction *Inst) {
+ if (ImmutableCallSite(Inst)) {
+ IsCall = true;
+ CS = ImmutableCallSite(Inst);
+ } else {
+ IsCall = false;
+ // There is no such thing as a MemoryLocation for a fence inst, and it is
+ // unique in that regard.
+ if (!isa<FenceInst>(Inst))
+ Loc = MemoryLocation::get(Inst);
+ }
+ }
+
+ explicit MemoryLocOrCall(MemoryLocation Loc) : IsCall(false), Loc(Loc) {}
+
+ bool IsCall;
+ ImmutableCallSite getCS() const {
+ assert(IsCall);
+ return CS;
+ }
+ MemoryLocation getLoc() const {
+ assert(!IsCall);
+ return Loc;
+ }
+ bool operator==(const MemoryLocOrCall &Other) const {
+ if (IsCall != Other.IsCall)
+ return false;
+
+ if (IsCall)
+ return CS.getCalledValue() == Other.CS.getCalledValue();
+ else
+ return Loc == Other.Loc;
+ }
+
+private:
+ union {
+ ImmutableCallSite CS;
+ MemoryLocation Loc;
+ };
+};
+
+template <> struct DenseMapInfo<MemoryLocOrCall> {
+ static inline MemoryLocOrCall getEmptyKey() {
+ return MemoryLocOrCall(DenseMapInfo<MemoryLocation>::getEmptyKey());
+ }
+ static inline MemoryLocOrCall getTombstoneKey() {
+ return MemoryLocOrCall(DenseMapInfo<MemoryLocation>::getTombstoneKey());
+ }
+ static unsigned getHashValue(const MemoryLocOrCall &MLOC) {
+ if (MLOC.IsCall)
+ return hash_combine(MLOC.IsCall,
+ DenseMapInfo<const Value *>::getHashValue(
+ MLOC.getCS().getCalledValue()));
+ else
+ return hash_combine(
+ MLOC.IsCall,
+ DenseMapInfo<MemoryLocation>::getHashValue(MLOC.getLoc()));
+ }
+ static bool isEqual(const MemoryLocOrCall &LHS, const MemoryLocOrCall &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class is a batch walker of all MemoryUses in the program, and points
+/// their defining access at the thing that actually clobbers them. Because it
+/// is a batch walker that touches everything, it does not operate like the
+/// other walkers. This walker is basically performing a top-down SSA renaming
+/// pass, where the version stack is used as the cache. This enables it to be
+/// significantly more time and memory efficient than using the regular walker,
+/// which is walking bottom-up.
+class MemorySSA::OptimizeUses {
+public:
+ OptimizeUses(MemorySSA *MSSA, MemorySSAWalker *Walker, AliasAnalysis *AA,
+ DominatorTree *DT)
+ : MSSA(MSSA), Walker(Walker), AA(AA), DT(DT) {}
+
+ void optimizeUses();
+
+private:
+ /// This represents where a given MemoryLocation is in the stack.
+ struct MemlocStackInfo {
+ // This essentially keeps track of versions of the stack. Whenever the
+ // stack changes due to pushes or pops, these versions increase.
+ unsigned long StackEpoch;
+ unsigned long PopEpoch;
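+ // For example, pushing a MemoryDef or MemoryPhi bumps StackEpoch, while
+ // popping accesses when moving to a new block bumps PopEpoch. A use whose
+ // recorded epochs both match the current ones has nothing new to check.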
+ // This is the lower bound of places on the stack to check. It is equal to
+ // the place the last stack walk ended.
+ // Note: Correctness depends on this being initialized to 0, which DenseMap
+ // does.
+ unsigned long LowerBound;
+ // This is where the last walk for this memory location ended.
+ unsigned long LastKill;
+ bool LastKillValid;
+ };
+ void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &,
+ SmallVectorImpl<MemoryAccess *> &,
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> &);
+ MemorySSA *MSSA;
+ MemorySSAWalker *Walker;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+};
+
+static bool instructionClobbersQuery(MemoryDef *MD,
+ const MemoryLocOrCall &UseMLOC,
+ AliasAnalysis &AA) {
+ Instruction *DefInst = MD->getMemoryInst();
+ assert(DefInst && "Defining instruction not actually an instruction");
+ if (!UseMLOC.IsCall)
+ return AA.getModRefInfo(DefInst, UseMLOC.getLoc()) & MRI_Mod;
+
+ ModRefInfo I = AA.getModRefInfo(DefInst, UseMLOC.getCS());
+ return I != MRI_NoModRef;
+}
+
+/// Optimize the uses in a given block. This is basically the SSA renaming
+/// algorithm, with one caveat: we are able to use a single stack for all
+/// MemoryUses. This is because the set of *possible* reaching MemoryDefs is
+/// the same for every MemoryUse. The *actual* clobbering MemoryDef is just
+/// going to be some position in that stack of possible ones.
+///
+/// For each MemoryLocation we track the range of stack positions it still
+/// needs to check and where its last walk ended, because we only want to
+/// re-check the things that have changed since last time. The same
+/// MemoryLocation is always clobbered by the same store (getModRefInfo does
+/// not currently use invariance or similar properties; if it starts to, we
+/// can extend MemoryLocOrCall to include the relevant data).
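+///
+/// For example, if a use's location was last checked up to stack index 7 and
+/// two defs have since been pushed (indexes 8 and 9), only those two entries
+/// are queried; if neither clobbers the location, the use keeps its
+/// previously computed LastKill as its defining access.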
+void MemorySSA::OptimizeUses::optimizeUsesInBlock(
+ const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch,
+ SmallVectorImpl<MemoryAccess *> &VersionStack,
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> &LocStackInfo) {
+
+ // If no accesses, nothing to do.
+ MemorySSA::AccessList *Accesses = MSSA->getWritableBlockAccesses(BB);
+ if (Accesses == nullptr)
+ return;
+
+ // Pop everything that doesn't dominate the current block off the stack,
+ // increment the PopEpoch to account for this.
+ while (!VersionStack.empty()) {
+ BasicBlock *BackBlock = VersionStack.back()->getBlock();
+ if (DT->dominates(BackBlock, BB))
+ break;
+ while (VersionStack.back()->getBlock() == BackBlock)
+ VersionStack.pop_back();
+ ++PopEpoch;
+ }
+
+ for (MemoryAccess &MA : *Accesses) {
+ auto *MU = dyn_cast<MemoryUse>(&MA);
+ if (!MU) {
+ VersionStack.push_back(&MA);
+ ++StackEpoch;
+ continue;
+ }
+
+ MemoryLocOrCall UseMLOC(MU);
+ auto &LocInfo = LocStackInfo[UseMLOC];
+ // If the pop epoch changed, it means we've removed stuff from the top of
+ // the stack due to changing blocks. We may have to reset the lower bound
+ // or last kill info.
+ if (LocInfo.PopEpoch != PopEpoch) {
+ LocInfo.PopEpoch = PopEpoch;
+ LocInfo.StackEpoch = StackEpoch;
+ // If the lower bound was in the info we popped, we have to reset it.
+ if (LocInfo.LowerBound >= VersionStack.size()) {
+ // Reset the lower bound of things to check.
+ // TODO: Some day we should be able to reset to last kill, rather than
+ // 0.
+
+ LocInfo.LowerBound = 0;
+ LocInfo.LastKillValid = false;
+ }
+ } else if (LocInfo.StackEpoch != StackEpoch) {
+ // If all that has changed is the StackEpoch, we only have to check the
+ // new things on the stack, because we've checked everything before. In
+ // this case, the lower bound of things to check remains the same.
+ LocInfo.PopEpoch = PopEpoch;
+ LocInfo.StackEpoch = StackEpoch;
+ }
+ if (!LocInfo.LastKillValid) {
+ LocInfo.LastKill = VersionStack.size() - 1;
+ LocInfo.LastKillValid = true;
+ }
+
+ // At this point, we should have corrected last kill and LowerBound to be
+ // in bounds.
+ assert(LocInfo.LowerBound < VersionStack.size() &&
+ "Lower bound out of range");
+ assert(LocInfo.LastKill < VersionStack.size() &&
+ "Last kill info out of range");
+ // In any case, the new upper bound is the top of the stack.
+ unsigned long UpperBound = VersionStack.size() - 1;
+
+ if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) {
+ DEBUG(dbgs() << "We are being asked to check up to "
+ << UpperBound - LocInfo.LowerBound
+ << " loads and stores, so we won't.\n");
+ // Because we did not walk, LastKill is no longer valid, as this may
+ // have been a kill.
+ LocInfo.LastKillValid = false;
+ continue;
+ }
+ bool FoundClobberResult = false;
+ while (UpperBound > LocInfo.LowerBound) {
+ if (isa<MemoryPhi>(VersionStack[UpperBound])) {
+ // For phis, use the walker to see where we ended up, then go there.
+ Instruction *UseInst = MU->getMemoryInst();
+ MemoryAccess *Result = Walker->getClobberingMemoryAccess(UseInst);
+ // We are guaranteed to find it, or something is wrong.
+ while (VersionStack[UpperBound] != Result) {
+ assert(UpperBound != 0);
+ --UpperBound;
+ }
+ FoundClobberResult = true;
+ break;
+ }
+
+ MemoryDef *MD = cast<MemoryDef>(VersionStack[UpperBound]);
+
+ if (instructionClobbersQuery(MD, UseMLOC, *AA)) {
+ FoundClobberResult = true;
+ break;
+ }
+ --UpperBound;
+ }
+ // At the end of this loop, UpperBound is either a clobber or the lower
+ // bound. PHI walking may cause it to be < LowerBound, and in fact even
+ // < LastKill.
+ if (FoundClobberResult || UpperBound < LocInfo.LastKill) {
+ MU->setDefiningAccess(VersionStack[UpperBound]);
+ // Our last kill is now wherever we ended up.
+ LocInfo.LastKill = UpperBound;
+ } else {
+ // Otherwise, we checked all the new ones, and now we know we can get to
+ // LastKill.
+ MU->setDefiningAccess(VersionStack[LocInfo.LastKill]);
+ }
+ LocInfo.LowerBound = VersionStack.size() - 1;
+ }
+}
+
+/// Optimize uses to point to their actual clobbering definitions.
+void MemorySSA::OptimizeUses::optimizeUses() {
+
+ // We perform a non-recursive top-down dominator tree walk.
+ struct StackInfo {
+ const DomTreeNode *Node;
+ DomTreeNode::const_iterator Iter;
+ };
+
+ SmallVector<MemoryAccess *, 16> VersionStack;
+ SmallVector<StackInfo, 16> DomTreeWorklist;
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> LocStackInfo;
+ DomTreeWorklist.push_back({DT->getRootNode(), DT->getRootNode()->begin()});
+ // Bottom of the version stack is always live on entry.
+ VersionStack.push_back(MSSA->getLiveOnEntryDef());
+
+ unsigned long StackEpoch = 1;
+ unsigned long PopEpoch = 1;
+ while (!DomTreeWorklist.empty()) {
+ const auto *DomNode = DomTreeWorklist.back().Node;
+ const auto DomIter = DomTreeWorklist.back().Iter;
+ BasicBlock *BB = DomNode->getBlock();
+ optimizeUsesInBlock(BB, StackEpoch, PopEpoch, VersionStack, LocStackInfo);
+ if (DomIter == DomNode->end()) {
+ // Hit the end, pop the worklist
+ DomTreeWorklist.pop_back();
+ continue;
+ }
+ // Move the iterator to the next child for the next time we get to process
+ // children
+ ++DomTreeWorklist.back().Iter;
+
+ // Now visit the next child
+ DomTreeWorklist.push_back({*DomIter, (*DomIter)->begin()});
+ }
+}
+
void MemorySSA::buildMemorySSA() {
// We create an access to represent "live on entry", for things like
// arguments or users of globals, where the memory they use is defined before
@@ -1161,25 +1459,7 @@ void MemorySSA::buildMemorySSA() {
// We're doing a batch of updates; don't drop useful caches between them.
Walker->setAutoResetWalker(false);
-
- // Now optimize the MemoryUse's defining access to point to the nearest
- // dominating clobbering def.
- // This ensures that MemoryUse's that are killed by the same store are
- // immediate users of that store, one of the invariants we guarantee.
- for (auto DomNode : depth_first(DT)) {
- BasicBlock *BB = DomNode->getBlock();
- auto AI = PerBlockAccesses.find(BB);
- if (AI == PerBlockAccesses.end())
- continue;
- AccessList *Accesses = AI->second.get();
- for (auto &MA : *Accesses) {
- if (auto *MU = dyn_cast<MemoryUse>(&MA)) {
- Instruction *Inst = MU->getMemoryInst();
- MU->setDefiningAccess(Walker->getClobberingMemoryAccess(Inst));
- }
- }
- }
-
+ OptimizeUses(this, Walker, AA, DT).optimizeUses();
Walker->setAutoResetWalker(true);
Walker->resetClobberWalker();