[llvm] r346345 - [MachineOutliner][NFC] Traverse suffix tree using a RepeatedSubstring iterator

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 7 11:20:55 PST 2018


Author: paquette
Date: Wed Nov  7 11:20:55 2018
New Revision: 346345

URL: http://llvm.org/viewvc/llvm-project?rev=346345&view=rev
Log:
[MachineOutliner][NFC] Traverse suffix tree using a RepeatedSubstring iterator

This takes the traversal methods introduced in r346269 and adapts them
into an iterator. This allows the outliner to iterate over repeated substrings
within the suffix tree directly without having to initially find all of the
substrings and then iterate over them after you've found them.

Modified:
    llvm/trunk/lib/CodeGen/MachineOutliner.cpp

Modified: llvm/trunk/lib/CodeGen/MachineOutliner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineOutliner.cpp?rev=346345&r1=346344&r2=346345&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineOutliner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineOutliner.cpp Wed Nov  7 11:20:55 2018
@@ -479,44 +479,6 @@ private:
     return SuffixesToAdd;
   }
 
-  /// Helper function for findRepeatedSubstrings.
-  /// Traverses the suffix tree that finds all nodes associated with a repeated
-  /// substring. That is, all internal non-root nodes. If the given node has
-  /// more than one leaf child, store the repeated strings in Substrings.
-  void
-  findRepeatedSubstringsHelper(SuffixTreeNode &Curr,
-                               std::vector<RepeatedSubstring> &Substrings,
-                               const unsigned MinLength = 1) {
-  assert(!Curr.isLeaf() && "Visited a leaf?");
-  std::vector<SuffixTreeNode *> LeafChildren;
-  unsigned Length = Curr.ConcatLen;
-
-  for (auto &ChildPair : Curr.Children) {
-    if (!ChildPair.second->isLeaf())
-      findRepeatedSubstringsHelper(*ChildPair.second, Substrings, MinLength);
-    else if (Length >= MinLength)
-      LeafChildren.push_back(ChildPair.second);
-  }
-
-  // The root node never has repeats. Quit here.
-  if (Curr.isRoot())
-    return;
-
-  // If there are no occurrences of the minimum length, then quit.
-  if (LeafChildren.empty() || LeafChildren.size() < 2)
-    return;
-
-  // We have a node associated with a repeated substring. Store that in
-  // Substrings and move on.
-  RepeatedSubstring RS;
-  RS.Length = Length;
-
-  // Each occurrence starts at a suffix given by a leaf child.
-  for (SuffixTreeNode *Leaf : LeafChildren)
-    RS.StartIndices.push_back(Leaf->SuffixIdx);
-  Substrings.push_back(RS);
-}
-
 public:
   /// Construct a suffix tree from a sequence of unsigned integers.
   ///
@@ -545,14 +507,115 @@ public:
     setSuffixIndices(*Root, 0);
   }
 
-  /// Finds all repeated substrings with an optionally-provided minimum length
-  /// and stores them in \p Substrings.
-  /// If \p MinLength is provided, only return those with a given minimum
-  /// length.
-  void findRepeatedSubstrings(std::vector<RepeatedSubstring> &Substrings,
-                              const unsigned MinLength = 1) {
-    findRepeatedSubstringsHelper(*Root, Substrings, MinLength);
-  }
+
+  /// Iterator for finding all repeated substrings in the suffix tree.
+  struct RepeatedSubstringIterator {
+    private:
+    /// The current node we're visiting.
+    SuffixTreeNode *N = nullptr;
+
+    /// The repeated substring associated with this node.
+    RepeatedSubstring RS;
+
+    /// The nodes left to visit.
+    std::vector<SuffixTreeNode *> ToVisit;
+
+    /// The minimum length of a repeated substring to find.
+    /// Since we're outlining, we want at least two instructions in the range.
+    /// FIXME: This may not be true for targets like X86 which support many
+    /// instruction lengths.
+    const unsigned MinLength = 2;
+
+    /// Move the iterator to the next repeated substring.
+    void advance() {
+      // Clear the current state. If we're at the end of the range, then this
+      // is the state we want to be in.
+      RS = RepeatedSubstring();
+      N = nullptr;
+
+      // Continue visiting nodes until we find one which repeats more than once.
+      while (!ToVisit.empty()) {
+        SuffixTreeNode *Curr = ToVisit.back();
+        ToVisit.pop_back();
+
+        // Keep track of the length of the string associated with the node. If
+        // it's too short, we'll quit.
+        unsigned Length = Curr->ConcatLen;
+
+        // Each leaf node represents a repeat of a string.
+        std::vector<SuffixTreeNode *> LeafChildren;
+
+        // Iterate over each child, saving internal nodes for visiting, and
+        // leaf nodes in LeafChildren. Internal nodes represent individual
+        // strings, which may repeat.
+        for (auto &ChildPair : Curr->Children) {
+          // Save all of this node's children for processing.
+          if (!ChildPair.second->isLeaf())
+            ToVisit.push_back(ChildPair.second);
+
+          // It's not an internal node, so it must be a leaf. If we have a
+          // long enough string, then save the leaf children.
+          else if (Length >= MinLength)
+            LeafChildren.push_back(ChildPair.second);
+        }
+
+        // The root never represents a repeated substring. If we're looking at
+        // that, then skip it.
+        if (Curr->isRoot())
+          continue;
+
+        // Do we have any repeated substrings?
+        if (LeafChildren.size() >= 2) {
+          // Yes. Update the state to reflect this, and then bail out.
+          N = Curr;
+          RS.Length = Length;
+          for (SuffixTreeNode *Leaf : LeafChildren)
+            RS.StartIndices.push_back(Leaf->SuffixIdx);
+          break;
+        }
+      }
+
+      // At this point, either NewRS is an empty RepeatedSubstring, or it was
+      // set in the above loop. Similarly, N is either nullptr, or the node
+      // associated with NewRS.
+    }
+
+  public:
+    /// Return the current repeated substring.
+    RepeatedSubstring &operator*() { return RS; }
+
+    RepeatedSubstringIterator &operator++() {
+      advance();
+      return *this;
+    }
+
+    RepeatedSubstringIterator operator++(int I) {
+      RepeatedSubstringIterator It(*this);
+      advance();
+      return It;
+    }
+
+    bool operator==(const RepeatedSubstringIterator &Other) {
+      return N == Other.N;
+    }
+    bool operator!=(const RepeatedSubstringIterator &Other) {
+      return !(*this == Other);
+    }
+
+    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+      // Do we have a non-null node?
+      if (N) {
+        // Yes. At the first step, we need to visit all of N's children.
+        // Note: This means that we visit N last.
+        ToVisit.push_back(N);
+        advance();
+      }
+    }
+};
+
+  typedef RepeatedSubstringIterator iterator;
+  iterator begin() { return iterator(Root); }
+  iterator end() { return iterator(nullptr); }
 };
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -963,13 +1026,8 @@ unsigned MachineOutliner::findCandidates
 
   // First, find dall of the repeated substrings in the tree of minimum length
   // 2.
-  // FIXME: 2 is an approximation which isn't necessarily true for, say, X86.
-  // If we factor in instruction lengths, we need more information than this.
-  // FIXME: It'd be nice if we could just have a repeated substring iterator.
-  std::vector<SuffixTree::RepeatedSubstring> RepeatedSubstrings;
-  ST.findRepeatedSubstrings(RepeatedSubstrings, 2);
-
-  for (SuffixTree::RepeatedSubstring &RS : RepeatedSubstrings) {
+  for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
+    SuffixTree::RepeatedSubstring RS = *It;
     std::vector<Candidate> CandidatesForRepeatedSeq;
     unsigned StringLen = RS.Length;
     for (const unsigned &StartIdx : RS.StartIndices) {




More information about the llvm-commits mailing list