[llvm] c2f0c20 - [NFC] Refactor SuffixTree to use LLVM-style RTTI

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Thu May 11 17:51:39 PDT 2023


Author: Jessica Paquette
Date: 2023-05-11T17:50:21-07:00
New Revision: c2f0c204d1847fac9f8d47c06a40cecd717a546d

URL: https://github.com/llvm/llvm-project/commit/c2f0c204d1847fac9f8d47c06a40cecd717a546d
DIFF: https://github.com/llvm/llvm-project/commit/c2f0c204d1847fac9f8d47c06a40cecd717a546d.diff

LOG: [NFC] Refactor SuffixTree to use LLVM-style RTTI

Following guidelines in

https://llvm.org/docs/HowToSetUpLLVMStyleRTTI.html

This allows us to

* Quickly discern between leaf and internal nodes
* Be more idiomatic with the rest of LLVM
* Save some size on node structs
* Reduce the number of allocations (because end indices for internal nodes no
  longer need to be pointers to be compatible with leaf nodes)

Also object orientify the code some more. This allows for more asserts and
checks.

This shouldn't impact code size on the MachineOutliner.

- All unit tests pass (outliner lit + llvm-unit)
- No code size changes on CTMark @ -Oz for AArch64

Added: 
    

Modified: 
    llvm/include/llvm/Support/SuffixTree.h
    llvm/lib/Support/SuffixTree.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h
index 4fb8da51d56bb..e8aa9d4f25460 100644
--- a/llvm/include/llvm/Support/SuffixTree.h
+++ b/llvm/include/llvm/Support/SuffixTree.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include <vector>
 
 namespace llvm {
@@ -38,32 +39,53 @@ const unsigned EmptyIdx = -1;
 /// in \p Link. Each leaf node stores the start index of its respective
 /// suffix in \p SuffixIdx.
 struct SuffixTreeNode {
+public:
+  enum class NodeKind { ST_Leaf, ST_Internal };
 
-  /// The children of this node.
-  ///
-  /// A child existing on an unsigned integer implies that from the mapping
-  /// represented by the current node, there is a way to reach another
-  /// mapping by tacking that character on the end of the current string.
-  DenseMap<unsigned, SuffixTreeNode *> Children;
-
+private:
+  const NodeKind Kind;
   /// The start index of this node's substring in the main string.
   unsigned StartIdx = EmptyIdx;
 
+  /// The length of the string formed by concatenating the edge labels from
+  /// the root to this node.
+  unsigned ConcatLen = 0;
+
+public:
+  NodeKind getKind() const { return Kind; }
+
+  /// \return the start index of this node's substring in the entire string.
+  virtual unsigned getStartIdx() const { return StartIdx; }
+
+  /// \returns the end index of this node.
+  virtual unsigned getEndIdx() const = 0;
+
+  /// Advance this node's StartIdx by \p Inc.
+  void incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
+
+  /// Set the length of the string from the root to this node to \p Len.
+  void setConcatLen(unsigned Len) { ConcatLen = Len; }
+
+  /// \returns the length of the string from the root to this node.
+  unsigned getConcatLen() const { return ConcatLen; }
+
+  SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
+      : Kind(Kind), StartIdx(StartIdx) {}
+  virtual ~SuffixTreeNode() = default;
+};
+
+struct SuffixTreeInternalNode : SuffixTreeNode {
+private:
   /// The end index of this node's substring in the main string.
   ///
   /// Every leaf node must have its \p EndIdx incremented at the end of every
   /// step in the construction algorithm. To avoid having to update O(N)
   /// nodes individually at the end of every step, the end index is stored
   /// as a pointer.
-  unsigned *EndIdx = nullptr;
+  unsigned EndIdx = EmptyIdx;
 
-  /// For leaves, the start index of the suffix represented by this node.
-  ///
-  /// For all other nodes, this is ignored.
-  unsigned SuffixIdx = EmptyIdx;
-
-  /// For internal nodes, a pointer to the internal node representing
-  /// the same sequence with the first character chopped off.
+  /// A pointer to the internal node representing the same sequence with the
+  /// first character chopped off.
   ///
   /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
   /// Ukkonen's algorithm does to achieve linear-time construction is
@@ -80,36 +102,77 @@ struct SuffixTreeNode {
   /// move to the next insertion point in O(1) time. If we don't, then we'd
   /// have to query from the root, which takes O(N) time. This would make the
   /// construction algorithm O(N^2) rather than O(N).
-  SuffixTreeNode *Link = nullptr;
+  SuffixTreeInternalNode *Link = nullptr;
 
-  /// The length of the string formed by concatenating the edge labels from the
-  /// root to this node.
-  unsigned ConcatLen = 0;
+public:
+  static bool classof(const SuffixTreeNode *N) {
+    return N->getKind() == NodeKind::ST_Internal;
+  }
+
+  /// \returns true if this node is the root of its owning \p SuffixTree.
+  bool isRoot() const { return getStartIdx() == EmptyIdx; }
 
-  /// Returns true if this node is a leaf.
-  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+  /// \returns the end index of this node's substring in the entire string.
+  unsigned getEndIdx() const override { return EndIdx; }
 
-  /// Returns true if this node is the root of its owning \p SuffixTree.
-  bool isRoot() const { return StartIdx == EmptyIdx; }
+  /// Sets \p Link to \p L. Assumes \p L is not null.
+  void setLink(SuffixTreeInternalNode *L) {
+    assert(L && "Cannot set a null link?");
+    Link = L;
+  }
 
-  /// Return the number of elements in the substring associated with this node.
-  size_t size() const {
+  /// \returns the pointer to the Link node.
+  SuffixTreeInternalNode *getLink() const {
+    return Link;
+  }
 
-    // Is it the root? If so, it's the empty string so return 0.
-    if (isRoot())
-      return 0;
+  /// The children of this node.
+  ///
+  /// A child existing on an unsigned integer implies that from the mapping
+  /// represented by the current node, there is a way to reach another
+  /// mapping by tacking that character on the end of the current string.
+  DenseMap<unsigned, SuffixTreeNode *> Children;
 
-    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+  SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
+                         SuffixTreeInternalNode *Link)
+      : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
+        Link(Link) {}
 
-    // Size = the number of elements in the string.
-    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
-    return *EndIdx - StartIdx + 1;
+  virtual ~SuffixTreeInternalNode() = default;
+};
+
+struct SuffixTreeLeafNode : SuffixTreeNode {
+private:
+  /// The start index of the suffix represented by this leaf.
+  unsigned SuffixIdx = EmptyIdx;
+
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  unsigned *EndIdx = nullptr;
+
+public:
+  static bool classof(const SuffixTreeNode *N) {
+    return N->getKind() == NodeKind::ST_Leaf;
   }
 
-  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
-      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
+  /// \returns the end index of this node's substring in the entire string.
+  unsigned getEndIdx() const override {
+    assert(EndIdx && "EndIdx is empty?");
+    return *EndIdx;
+  }
+
+  /// \returns the start index of the suffix represented by this leaf.
+  unsigned getSuffixIdx() const { return SuffixIdx; }
+  /// Sets the start index of the suffix represented by this leaf to \p Idx.
+  void setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
+  SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
+      : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
 
-  SuffixTreeNode() = default;
+  virtual ~SuffixTreeLeafNode() = default;
 };
 
 /// A data structure for fast substring queries.
@@ -149,23 +212,16 @@ class SuffixTree {
   };
 
 private:
-  /// Maintains each node in the tree.
-  SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+  /// Maintains internal nodes in the tree.
+  SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
+  /// Maintains leaf nodes in the tree.
+  SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
 
   /// The root of the suffix tree.
   ///
   /// The root represents the empty string. It is maintained by the
   /// \p NodeAllocator like every other node in the tree.
-  SuffixTreeNode *Root = nullptr;
-
-  /// Maintains the end indices of the internal nodes in the tree.
-  ///
-  /// Each internal node is guaranteed to never have its end index change
-  /// during the construction algorithm; however, leaves must be updated at
-  /// every step. Therefore, we need to store leaf end indices by reference
-  /// to avoid updating O(N) leaves at every step of construction. Thus,
-  /// every internal node must be allocated its own end index.
-  BumpPtrAllocator InternalEndIdxAllocator;
+  SuffixTreeInternalNode *Root = nullptr;
 
   /// The end index of each leaf in the tree.
   unsigned LeafEndIdx = -1;
@@ -174,7 +230,7 @@ class SuffixTree {
   /// Ukkonen's algorithm.
   struct ActiveState {
     /// The next node to insert at.
-    SuffixTreeNode *Node = nullptr;
+    SuffixTreeInternalNode *Node = nullptr;
 
     /// The index of the first character in the substring currently being added.
     unsigned Idx = EmptyIdx;
@@ -194,7 +250,7 @@ class SuffixTree {
   /// \param Edge The label on the edge leaving \p Parent to this node.
   ///
   /// \returns A pointer to the allocated leaf node.
-  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
+  SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
                              unsigned Edge);
 
   /// Allocate an internal node and add it to the tree.
@@ -205,8 +261,9 @@ class SuffixTree {
   /// \param Edge The label on the edge leaving \p Parent to this node.
   ///
   /// \returns A pointer to the allocated internal node.
-  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
-                                     unsigned EndIdx, unsigned Edge);
+  SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
+                                             unsigned StartIdx, unsigned EndIdx,
+                                             unsigned Edge);
 
   /// Set the suffix indices of the leaves to the start indices of their
   /// respective suffixes.
@@ -244,7 +301,7 @@ class SuffixTree {
     RepeatedSubstring RS;
 
     /// The nodes left to visit.
-    SmallVector<SuffixTreeNode *> ToVisit;
+    SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
 
     /// The minimum length of a repeated substring to find.
     /// Since we're outlining, we want at least two instructions in the range.
@@ -260,30 +317,31 @@ class SuffixTree {
       N = nullptr;
 
       // Each leaf node represents a repeat of a string.
-      SmallVector<SuffixTreeNode *> LeafChildren;
+      SmallVector<SuffixTreeLeafNode *> LeafChildren;
 
       // Continue visiting nodes until we find one which repeats more than once.
-      while (!ToVisit.empty()) {
-        SuffixTreeNode *Curr = ToVisit.back();
-        ToVisit.pop_back();
+      while (!InternalNodesToVisit.empty()) {
         LeafChildren.clear();
+        auto *Curr = InternalNodesToVisit.back();
+        InternalNodesToVisit.pop_back();
 
         // Keep track of the length of the string associated with the node. If
         // it's too short, we'll quit.
-        unsigned Length = Curr->ConcatLen;
+        unsigned Length = Curr->getConcatLen();
 
         // Iterate over each child, saving internal nodes for visiting, and
         // leaf nodes in LeafChildren. Internal nodes represent individual
         // strings, which may repeat.
         for (auto &ChildPair : Curr->Children) {
           // Save all of this node's children for processing.
-          if (!ChildPair.second->isLeaf())
-            ToVisit.push_back(ChildPair.second);
+          if (auto *InternalChild =
+                  dyn_cast<SuffixTreeInternalNode>(ChildPair.second))
+            InternalNodesToVisit.push_back(InternalChild);
 
           // It's not an internal node, so it must be a leaf. If we have a
           // long enough string, then save the leaf children.
           else if (Length >= MinLength)
-            LeafChildren.push_back(ChildPair.second);
+            LeafChildren.push_back(cast<SuffixTreeLeafNode>(ChildPair.second));
         }
 
         // The root never represents a repeated substring. If we're looking at
@@ -296,12 +354,11 @@ class SuffixTree {
           // Yes. Update the state to reflect this, and then bail out.
           N = Curr;
           RS.Length = Length;
-          for (SuffixTreeNode *Leaf : LeafChildren)
-            RS.StartIndices.push_back(Leaf->SuffixIdx);
+          for (SuffixTreeLeafNode *Leaf : LeafChildren)
+            RS.StartIndices.push_back(Leaf->getSuffixIdx());
           break;
         }
       }
-
       // At this point, either NewRS is an empty RepeatedSubstring, or it was
       // set in the above loop. Similarly, N is either nullptr, or the node
       // associated with NewRS.
@@ -329,14 +386,14 @@ class SuffixTree {
       return !(*this == Other);
     }
 
-    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+    RepeatedSubstringIterator(SuffixTreeInternalNode *N) : N(N) {
       // Do we have a non-null node?
-      if (N) {
-        // Yes. At the first step, we need to visit all of N's children.
-        // Note: This means that we visit N last.
-        ToVisit.push_back(N);
-        advance();
-      }
+      if (!N)
+        return;
+      // Yes. At the first step, we need to visit all of N's children.
+      // Note: This means that we visit N last.
+      InternalNodesToVisit.push_back(N);
+      advance();
     }
   };
 

diff  --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
index e2cd7b6073ac8..689849ae9168e 100644
--- a/llvm/lib/Support/SuffixTree.cpp
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -12,10 +12,18 @@
 
 #include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/Allocator.h"
-#include <vector>
 
 using namespace llvm;
 
+/// \returns the number of elements in the substring associated with \p N.
+static size_t numElementsInSubstring(const SuffixTreeNode *N) {
+  assert(N && "Got a null node?");
+  if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
+    if (Internal->isRoot())
+      return 0;
+  return N->getEndIdx() - N->getStartIdx() + 1;
+}
+
 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
   Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
   Active.Node = Root;
@@ -38,32 +46,26 @@ SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
   setSuffixIndices();
 }
 
-SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
+SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
                                        unsigned StartIdx, unsigned Edge) {
-
   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
-
-  SuffixTreeNode *N = new (NodeAllocator.Allocate())
-      SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
+  auto *N = new (LeafNodeAllocator.Allocate())
+      SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
   Parent.Children[Edge] = N;
-
   return N;
 }
 
-SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
-                                               unsigned StartIdx,
-                                               unsigned EndIdx, unsigned Edge) {
-
+SuffixTreeInternalNode *
+SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
+                               unsigned StartIdx, unsigned EndIdx,
+                               unsigned Edge) {
   assert(StartIdx <= EndIdx && "String can't start after it ends!");
   assert(!(!Parent && StartIdx != EmptyIdx) &&
          "Non-root internal nodes must have parents!");
-
-  unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
-  SuffixTreeNode *N =
-      new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
+  auto *N = new (InternalNodeAllocator.Allocate())
+      SuffixTreeInternalNode(StartIdx, EndIdx, Root);
   if (Parent)
     Parent->Children[Edge] = N;
-
   return N;
 }
 
@@ -81,21 +83,23 @@ void SuffixTree::setSuffixIndices() {
   while (!ToVisit.empty()) {
     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
     ToVisit.pop_back();
-    CurrNode->ConcatLen = CurrNodeLen;
-    for (auto &ChildPair : CurrNode->Children) {
-      assert(ChildPair.second && "Node had a null child!");
-      ToVisit.push_back(
-          {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
-    }
-
+    // Length of the current node from the root down to here.
+    CurrNode->setConcatLen(CurrNodeLen);
+    if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
+      for (auto &ChildPair : InternalNode->Children) {
+        assert(ChildPair.second && "Node had a null child!");
+        ToVisit.push_back(
+            {ChildPair.second,
+             CurrNodeLen + numElementsInSubstring(ChildPair.second)});
+      }
     // No children, so we are at the end of the string.
-    if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
-      CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
+    if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
+      LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
   }
 }
 
 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
-  SuffixTreeNode *NeedsLink = nullptr;
+  SuffixTreeInternalNode *NeedsLink = nullptr;
 
   while (SuffixesToAdd > 0) {
 
@@ -118,7 +122,7 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       // The active node is an internal node, and we visited it, so it must
       // need a link if it doesn't have one.
       if (NeedsLink) {
-        NeedsLink->Link = Active.Node;
+        NeedsLink->setLink(Active.Node);
         NeedsLink = nullptr;
       }
     } else {
@@ -126,16 +130,18 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       // insert a new node.
       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
 
-      unsigned SubstringLen = NextNode->size();
+      unsigned SubstringLen = numElementsInSubstring(NextNode);
 
       // Is the current suffix we're trying to insert longer than the size of
       // the child we want to move to?
       if (Active.Len >= SubstringLen) {
         // If yes, then consume the characters we've seen and move to the next
         // node.
+        assert(isa<SuffixTreeInternalNode>(NextNode) &&
+               "Expected an internal node?");
         Active.Idx += SubstringLen;
         Active.Len -= SubstringLen;
-        Active.Node = NextNode;
+        Active.Node = cast<SuffixTreeInternalNode>(NextNode);
         continue;
       }
 
@@ -144,12 +150,12 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       unsigned LastChar = Str[EndIdx];
 
       // Is the string we're trying to insert a substring of the next node?
-      if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+      if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
         // If yes, then we're done for this step. Remember our insertion point
         // and move to the next end index. At this point, we have an implicit
         // suffix tree.
         if (NeedsLink && !Active.Node->isRoot()) {
-          NeedsLink->Link = Active.Node;
+          NeedsLink->setLink(Active.Node);
           NeedsLink = nullptr;
         }
 
@@ -171,9 +177,9 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       //                      n   l
 
       // The node s from the diagram
-      SuffixTreeNode *SplitNode =
-          insertInternalNode(Active.Node, NextNode->StartIdx,
-                             NextNode->StartIdx + Active.Len - 1, FirstChar);
+      SuffixTreeInternalNode *SplitNode =
+          insertInternalNode(Active.Node, NextNode->getStartIdx(),
+                             NextNode->getStartIdx() + Active.Len - 1, FirstChar);
 
       // Insert the new node representing the new substring into the tree as
       // a child of the split node. This is the node l from the diagram.
@@ -181,12 +187,12 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
 
       // Make the old node a child of the split node and update its start
       // index. This is the node n from the diagram.
-      NextNode->StartIdx += Active.Len;
-      SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+      NextNode->incrementStartIdx(Active.Len);
+      SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
 
       // SplitNode is an internal node, update the suffix link.
       if (NeedsLink)
-        NeedsLink->Link = SplitNode;
+        NeedsLink->setLink(SplitNode);
 
       NeedsLink = SplitNode;
     }
@@ -202,7 +208,7 @@ unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
       }
     } else {
       // Start the next phase at the next smallest suffix.
-      Active.Node = Active.Node->Link;
+      Active.Node = Active.Node->getLink();
     }
   }
 


        


More information about the llvm-commits mailing list