[llvm] r346269 - [MachineOutliner][NFC] Add findRepeatedSubstrings to SuffixTree, kill LeafVector

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 6 13:46:41 PST 2018


Author: paquette
Date: Tue Nov  6 13:46:41 2018
New Revision: 346269

URL: http://llvm.org/viewvc/llvm-project?rev=346269&view=rev
Log:
[MachineOutliner][NFC] Add findRepeatedSubstrings to SuffixTree, kill LeafVector

Instead of iterating over the leaves to find repeated substrings, and walking
collecting leaf children when we don't necessarily need them, let's just
calculate what we need and iterate over that.

By doing this, we don't have to save every leaf. It's easier to read the code
too and understand what's going on.

The goal here, at the end of the day, is to set up to allow us to do something
like

for (RepeatedSubstring &RS : ST) {
 ... do stuff with RS ...
}

Which would let us perform the cost model stuff and the repeated substring
query at the same time.

Modified:
    llvm/trunk/lib/CodeGen/MachineOutliner.cpp

Modified: llvm/trunk/lib/CodeGen/MachineOutliner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineOutliner.cpp?rev=346269&r1=346268&r2=346269&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineOutliner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineOutliner.cpp Tue Nov  6 13:46:41 2018
@@ -231,14 +231,18 @@ struct SuffixTreeNode {
 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
 class SuffixTree {
 public:
-  /// Stores each leaf node in the tree.
-  ///
-  /// This is used for finding outlining candidates.
-  std::vector<SuffixTreeNode *> LeafVector;
-
   /// Each element is an integer representing an instruction in the module.
   ArrayRef<unsigned> Str;
 
+  /// A repeated substring in the tree.
+  struct RepeatedSubstring {
+    /// The length of the string.
+    unsigned Length;
+
+    /// The start indices of each occurrence.
+    std::vector<unsigned> StartIndices;
+  };
+
 private:
   /// Maintains each node in the tree.
   SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
@@ -322,8 +326,7 @@ private:
   }
 
   /// Set the suffix indices of the leaves to the start indices of their
-  /// respective suffixes. Also stores each leaf in \p LeafVector at its
-  /// respective suffix index.
+  /// respective suffixes.
   ///
   /// \param[in] CurrNode The node currently being visited.
   /// \param CurrIdx The current index of the string being visited.
@@ -353,9 +356,6 @@ private:
       CurrNode.SuffixIdx = Str.size() - CurrIdx;
       assert(CurrNode.Parent && "CurrNode had no parent!");
       CurrNode.Parent->OccurrenceCount++;
-
-      // Store the leaf in the leaf vector for pruning later.
-      LeafVector[CurrNode.SuffixIdx] = &CurrNode;
     }
   }
 
@@ -489,6 +489,44 @@ private:
     return SuffixesToAdd;
   }
 
+  /// Helper function for findRepeatedSubstrings.
+  /// Traverses the suffix tree that finds all nodes associated with a repeated
+  /// substring. That is, all internal non-root nodes. If the given node has
+  /// more than one leaf child, store the repeated strings in Substrings.
+  void
+  findRepeatedSubstringsHelper(SuffixTreeNode &Curr,
+                               std::vector<RepeatedSubstring> &Substrings,
+                               const unsigned MinLength = 1) {
+  assert(!Curr.isLeaf() && "Visited a leaf?");
+  std::vector<SuffixTreeNode *> LeafChildren;
+  unsigned Length = Curr.ConcatLen;
+
+  for (auto &ChildPair : Curr.Children) {
+    if (!ChildPair.second->isLeaf())
+      findRepeatedSubstringsHelper(*ChildPair.second, Substrings, MinLength);
+    else if (Length >= MinLength)
+      LeafChildren.push_back(ChildPair.second);
+  }
+
+  // The root node never has repeats. Quit here.
+  if (Curr.isRoot())
+    return;
+
+  // If there are no occurrences of the minimum length, then quit.
+  if (LeafChildren.empty() || LeafChildren.size() < 2)
+    return;
+
+  // We have a node associated with a repeated substring. Store that in
+  // Substrings and move on.
+  RepeatedSubstring RS;
+  RS.Length = Length;
+
+  // Each occurrence starts at a suffix given by a leaf child.
+  for (SuffixTreeNode *Leaf : LeafChildren)
+    RS.StartIndices.push_back(Leaf->SuffixIdx);
+  Substrings.push_back(RS);
+}
+
 public:
   /// Construct a suffix tree from a sequence of unsigned integers.
   ///
@@ -497,7 +535,6 @@ public:
     Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
     Root->IsInTree = true;
     Active.Node = Root;
-    LeafVector = std::vector<SuffixTreeNode *>(Str.size());
 
     // Keep track of the number of suffixes we have to add of the current
     // prefix.
@@ -518,6 +555,15 @@ public:
     assert(Root && "Root node can't be nullptr!");
     setSuffixIndices(*Root, 0);
   }
+
+  /// Finds all repeated substrings with an optionally-provided minimum length
+  /// and stores them in \p Substrings.
+  /// If \p MinLength is provided, only return those with a given minimum
+  /// length.
+  void findRepeatedSubstrings(std::vector<RepeatedSubstring> &Substrings,
+                              const unsigned MinLength = 1) {
+    findRepeatedSubstringsHelper(*Root, Substrings, MinLength);
+  }
 };
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -925,80 +971,55 @@ unsigned MachineOutliner::findCandidates
   FunctionList.clear();
   unsigned MaxLen = 0;
 
-  // FIXME: Visit internal nodes instead of leaves.
-  for (SuffixTreeNode *Leaf : ST.LeafVector) {
-    assert(Leaf && "Leaves in LeafVector cannot be null!");
-    if (!Leaf->IsInTree)
-      continue;
-
-    assert(Leaf->Parent && "All leaves must have parents!");
-    SuffixTreeNode &Parent = *(Leaf->Parent);
+  // First, find dall of the repeated substrings in the tree of minimum length
+  // 2.
+  // FIXME: 2 is an approximation which isn't necessarily true for, say, X86.
+  // If we factor in instruction lengths, we need more information than this.
+  // FIXME: It'd be nice if we could just have a repeated substring iterator.
+  std::vector<SuffixTree::RepeatedSubstring> RepeatedSubstrings;
+  ST.findRepeatedSubstrings(RepeatedSubstrings, 2);
 
-    // If it doesn't appear enough, or we already outlined from it, skip it.
-    if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
-      continue;
-
-    // Figure out if this candidate is beneficial.
-    unsigned StringLen = Leaf->ConcatLen - (unsigned)Leaf->size();
-
-    // Too short to be beneficial; skip it.
-    // FIXME: This isn't necessarily true for, say, X86. If we factor in
-    // instruction lengths we need more information than this.
-    if (StringLen < 2)
-      continue;
-
-    // If this is a beneficial class of candidate, then every one is stored in
-    // this vector.
+  for (SuffixTree::RepeatedSubstring &RS : RepeatedSubstrings) {
     std::vector<Candidate> CandidatesForRepeatedSeq;
-
-    // Figure out the call overhead for each instance of the sequence.
-    for (auto &ChildPair : Parent.Children) {
-      SuffixTreeNode *M = ChildPair.second;
-
-      if (M && M->IsInTree && M->isLeaf()) {
-        // Never visit this leaf again.
-        M->IsInTree = false;
-        unsigned StartIdx = M->SuffixIdx;
-        unsigned EndIdx = StartIdx + StringLen - 1;
-
-        // Trick: Discard some candidates that would be incompatible with the
-        // ones we've already found for this sequence. This will save us some
-        // work in candidate selection.
-        //
-        // If two candidates overlap, then we can't outline them both. This
-        // happens when we have candidates that look like, say
-        //
-        // AA (where each "A" is an instruction).
-        //
-        // We might have some portion of the module that looks like this:
-        // AAAAAA (6 A's)
-        //
-        // In this case, there are 5 different copies of "AA" in this range, but
-        // at most 3 can be outlined. If only outlining 3 of these is going to
-        // be unbeneficial, then we ought to not bother.
-        //
-        // Note that two things DON'T overlap when they look like this:
-        // start1...end1 .... start2...end2
-        // That is, one must either
-        // * End before the other starts
-        // * Start after the other ends
-        if (std::all_of(CandidatesForRepeatedSeq.begin(),
-                        CandidatesForRepeatedSeq.end(),
-                        [&StartIdx, &EndIdx](const Candidate &C) {
-                          return (EndIdx < C.getStartIdx() ||
-                                  StartIdx > C.getEndIdx());
-                        })) {
-          // It doesn't overlap with anything, so we can outline it.
-          // Each sequence is over [StartIt, EndIt].
-          // Save the candidate and its location.
-
-          MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
-          MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
-
-          CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
-                                                EndIt, StartIt->getParent(),
-                                                FunctionList.size());
-        }
+    unsigned StringLen = RS.Length;
+    for (const unsigned &StartIdx : RS.StartIndices) {
+      unsigned EndIdx = StartIdx + StringLen - 1;
+      // Trick: Discard some candidates that would be incompatible with the
+      // ones we've already found for this sequence. This will save us some
+      // work in candidate selection.
+      //
+      // If two candidates overlap, then we can't outline them both. This
+      // happens when we have candidates that look like, say
+      //
+      // AA (where each "A" is an instruction).
+      //
+      // We might have some portion of the module that looks like this:
+      // AAAAAA (6 A's)
+      //
+      // In this case, there are 5 different copies of "AA" in this range, but
+      // at most 3 can be outlined. If only outlining 3 of these is going to
+      // be unbeneficial, then we ought to not bother.
+      //
+      // Note that two things DON'T overlap when they look like this:
+      // start1...end1 .... start2...end2
+      // That is, one must either
+      // * End before the other starts
+      // * Start after the other ends
+      if (std::all_of(
+              CandidatesForRepeatedSeq.begin(), CandidatesForRepeatedSeq.end(),
+              [&StartIdx, &EndIdx](const Candidate &C) {
+                return (EndIdx < C.getStartIdx() || StartIdx > C.getEndIdx());
+              })) {
+        // It doesn't overlap with anything, so we can outline it.
+        // Each sequence is over [StartIt, EndIt].
+        // Save the candidate and its location.
+
+        MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
+        MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+
+        CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
+                                              EndIt, StartIt->getParent(),
+                                              FunctionList.size());
       }
     }
 
@@ -1021,7 +1042,8 @@ unsigned MachineOutliner::findCandidates
       continue;
 
     std::vector<unsigned> Seq;
-    for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+    unsigned StartIdx = RS.StartIndices[0]; // Grab any start index.
+    for (unsigned i = StartIdx; i < StartIdx + StringLen; i++)
       Seq.push_back(ST.Str[i]);
     OF.Sequence = Seq;
     OF.Name = FunctionList.size();
@@ -1040,9 +1062,6 @@ unsigned MachineOutliner::findCandidates
     for (std::shared_ptr<Candidate> &C : OF.Candidates)
       CandidateList.push_back(C);
     FunctionList.push_back(OF);
-
-    // Move to the next function.
-    Parent.IsInTree = false;
   }
 
   return MaxLen;




More information about the llvm-commits mailing list