[llvm] c2eeaf1 - [NFC] SuffixTree: Move advance() into SuffixTree.cpp + more cleanup

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Thu May 11 22:30:07 PDT 2023


Author: Jessica Paquette
Date: 2023-05-11T22:29:47-07:00
New Revision: c2eeaf105a45fe26c302418b7c835161aedba6c2

URL: https://github.com/llvm/llvm-project/commit/c2eeaf105a45fe26c302418b7c835161aedba6c2
DIFF: https://github.com/llvm/llvm-project/commit/c2eeaf105a45fe26c302418b7c835161aedba6c2.diff

LOG: [NFC] SuffixTree: Move advance() into SuffixTree.cpp + more cleanup

Allows us to knock out a couple more includes from the header file.

Also clang-format SuffixTree.cpp while we're here.

Also use SuffixTreeNode::EmptyIdx in a couple more places.

Added: 
    

Modified: 
    llvm/include/llvm/Support/SuffixTree.h
    llvm/lib/Support/SuffixTree.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h
index aff6d9bfded1..4940fbbf308d 100644
--- a/llvm/include/llvm/Support/SuffixTree.h
+++ b/llvm/include/llvm/Support/SuffixTree.h
@@ -34,9 +34,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/SuffixTreeNode.h"
-#include <vector>
 
 namespace llvm {
 class SuffixTree {
@@ -66,7 +64,7 @@ class SuffixTree {
   SuffixTreeInternalNode *Root = nullptr;
 
   /// The end index of each leaf in the tree.
-  unsigned LeafEndIdx = -1;
+  unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
 
   /// Helper struct which keeps track of the next insertion point in
   /// Ukkonen's algorithm.
@@ -157,64 +155,7 @@ class SuffixTree {
     const unsigned MinLength = 2;
 
     /// Move the iterator to the next repeated substring.
-    void advance() {
-      // Clear the current state. If we're at the end of the range, then this
-      // is the state we want to be in.
-      RS = RepeatedSubstring();
-      N = nullptr;
-
-      // Each leaf node represents a repeat of a string.
-      SmallVector<unsigned> RepeatedSubstringStarts;
-
-      // Continue visiting nodes until we find one which repeats more than once.
-      while (!InternalNodesToVisit.empty()) {
-        RepeatedSubstringStarts.clear();
-        auto *Curr = InternalNodesToVisit.back();
-        InternalNodesToVisit.pop_back();
-
-        // Keep track of the length of the string associated with the node. If
-        // it's too short, we'll quit.
-        unsigned Length = Curr->getConcatLen();
-
-        // Iterate over each child, saving internal nodes for visiting, and
-        // leaf nodes in LeafChildren. Internal nodes represent individual
-        // strings, which may repeat.
-        for (auto &ChildPair : Curr->Children) {
-          // Save all of this node's children for processing.
-          if (auto *InternalChild =
-                  dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
-            InternalNodesToVisit.push_back(InternalChild);
-            continue;
-          }
-
-          if (Length < MinLength)
-            continue;
-
-          // Have an occurrence of a potentially repeated string. Save it.
-          auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
-          RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
-        }
-
-        // The root never represents a repeated substring. If we're looking at
-        // that, then skip it.
-        if (Curr->isRoot())
-          continue;
-
-        // Do we have any repeated substrings?
-        if (RepeatedSubstringStarts.size() < 2)
-          continue;
-
-        // Yes. Update the state to reflect this, and then bail out.
-        N = Curr;
-        RS.Length = Length;
-        for (unsigned StartIdx : RepeatedSubstringStarts)
-          RS.StartIndices.push_back(StartIdx);
-        break;
-      }
-      // At this point, either NewRS is an empty RepeatedSubstring, or it was
-      // set in the above loop. Similarly, N is either nullptr, or the node
-      // associated with NewRS.
-    }
+    void advance();
 
   public:
     /// Return the current repeated substring.

diff  --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
index d94e3f2c722e..eaa653078e09 100644
--- a/llvm/lib/Support/SuffixTree.cpp
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -12,6 +12,8 @@
 
 #include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/SuffixTreeNode.h"
 
 using namespace llvm;
 
@@ -182,9 +184,9 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       //                      n   l
 
       // The node s from the diagram
-      SuffixTreeInternalNode *SplitNode =
-          insertInternalNode(Active.Node, NextNode->getStartIdx(),
-                             NextNode->getStartIdx() + Active.Len - 1, FirstChar);
+      SuffixTreeInternalNode *SplitNode = insertInternalNode(
+          Active.Node, NextNode->getStartIdx(),
+          NextNode->getStartIdx() + Active.Len - 1, FirstChar);
 
       // Insert the new node representing the new substring into the tree as
       // a child of the split node. This is the node l from the diagram.
@@ -219,3 +221,62 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
 
   return SuffixesToAdd;
 }
+
+void SuffixTree::RepeatedSubstringIterator::advance() {
+  // Clear the current state. If we're at the end of the range, then this
+  // is the state we want to be in.
+  RS = RepeatedSubstring();
+  N = nullptr;
+
+  // Each leaf node represents a repeat of a string.
+  SmallVector<unsigned> RepeatedSubstringStarts;
+
+  // Continue visiting nodes until we find one which repeats more than once.
+  while (!InternalNodesToVisit.empty()) {
+    RepeatedSubstringStarts.clear();
+    auto *Curr = InternalNodesToVisit.back();
+    InternalNodesToVisit.pop_back();
+
+    // Keep track of the length of the string associated with the node. If
+    // it's too short, its leaf children are not recorded below.
+    unsigned Length = Curr->getConcatLen();
+
+    // Iterate over each child, saving internal nodes for visiting, and
+    // the suffix indices of leaf nodes in RepeatedSubstringStarts. Internal
+    // nodes represent individual strings, which may repeat.
+    for (auto &ChildPair : Curr->Children) {
+      // Save this node's internal children for later processing.
+      if (auto *InternalChild =
+              dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
+        InternalNodesToVisit.push_back(InternalChild);
+        continue;
+      }
+
+      if (Length < MinLength)
+        continue;
+
+      // Have an occurrence of a potentially repeated string. Save it.
+      auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
+      RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
+    }
+
+    // The root never represents a repeated substring. If we're looking at
+    // that, then skip it.
+    if (Curr->isRoot())
+      continue;
+
+    // Do we have at least two occurrences of this node's string?
+    if (RepeatedSubstringStarts.size() < 2)
+      continue;
+
+    // Yes. Update the state to reflect this, and then bail out.
+    N = Curr;
+    RS.Length = Length;
+    for (unsigned StartIdx : RepeatedSubstringStarts)
+      RS.StartIndices.push_back(StartIdx);
+    break;
+  }
+  // At this point, either RS is an empty RepeatedSubstring, or it was
+  // set in the above loop. Similarly, N is either nullptr, or the node
+  // associated with RS.
+}


        


More information about the llvm-commits mailing list