[llvm] 6cf993e - [NFC] SuffixTree: Split out SuffixTreeNodes into their own files

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Thu May 11 21:34:24 PDT 2023


Author: Jessica Paquette
Date: 2023-05-11T21:33:39-07:00
New Revision: 6cf993e59bd22197ebfa074235906a48971a6047

URL: https://github.com/llvm/llvm-project/commit/6cf993e59bd22197ebfa074235906a48971a6047
DIFF: https://github.com/llvm/llvm-project/commit/6cf993e59bd22197ebfa074235906a48971a6047.diff

LOG: [NFC] SuffixTree: Split out SuffixTreeNodes into their own files

Add:

- SuffixTreeNode.h
- SuffixTreeNode.cpp

The SuffixTree file was getting too long.

Added: 
    llvm/include/llvm/Support/SuffixTreeNode.h
    llvm/lib/Support/SuffixTreeNode.cpp

Modified: 
    llvm/include/llvm/Support/SuffixTree.h
    llvm/lib/Support/CMakeLists.txt

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h
index 372b83d1172a..189dd041bffe 100644
--- a/llvm/include/llvm/Support/SuffixTree.h
+++ b/llvm/include/llvm/Support/SuffixTree.h
@@ -13,167 +13,12 @@
 #define LLVM_SUPPORT_SUFFIXTREE_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/SuffixTreeNode.h"
 #include <vector>
 
 namespace llvm {
-
-/// A node in a suffix tree which represents a substring or suffix.
-///
-/// Each node has either no children or at least two children, with the root
-/// being a exception in the empty tree.
-///
-/// Children are represented as a map between unsigned integers and nodes. If
-/// a node N has a child M on unsigned integer k, then the mapping represented
-/// by N is a proper prefix of the mapping represented by M. Note that this,
-/// although similar to a trie is somewhat different: each node stores a full
-/// substring of the full mapping rather than a single character state.
-///
-/// Each internal node contains a pointer to the internal node representing
-/// the same string, but with the first character chopped off. This is stored
-/// in \p Link. Each leaf node stores the start index of its respective
-/// suffix in \p SuffixIdx.
-struct SuffixTreeNode {
-public:
-  /// Represents an undefined index in the suffix tree.
-  static const unsigned EmptyIdx = -1;
-  enum class NodeKind { ST_Leaf, ST_Internal };
-
-private:
-  const NodeKind Kind;
-  /// The start index of this node's substring in the main string.
-  unsigned StartIdx = EmptyIdx;
-
-  /// The length of the string formed by concatenating the edge labels from
-  /// the root to this node.
-  unsigned ConcatLen = 0;
-
-public:
-  NodeKind getKind() const { return Kind; }
-
-  /// \return the start index of this node's substring in the entire string.
-  virtual unsigned getStartIdx() const { return StartIdx; }
-
-  /// \returns the end index of this node.
-  virtual unsigned getEndIdx() const = 0;
-
-  /// Advance this node's StartIdx by \p Inc.
-  void incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
-
-  /// Set the length of the string from the root to this node to \p Len.
-  void setConcatLen(unsigned Len) { ConcatLen = Len; }
-
-  /// \returns the length of the string from the root to this node.
-  unsigned getConcatLen() const { return ConcatLen; }
-
-  SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
-      : Kind(Kind), StartIdx(StartIdx) {}
-  virtual ~SuffixTreeNode() = default;
-};
-
-struct SuffixTreeInternalNode : SuffixTreeNode {
-private:
-  /// The end index of this node's substring in the main string.
-  ///
-  /// Every leaf node must have its \p EndIdx incremented at the end of every
-  /// step in the construction algorithm. To avoid having to update O(N)
-  /// nodes individually at the end of every step, the end index is stored
-  /// as a pointer.
-  unsigned EndIdx = EmptyIdx;
-
-  /// A pointer to the internal node representing the same sequence with the
-  /// first character chopped off.
-  ///
-  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
-  /// Ukkonen's algorithm does to achieve linear-time construction is
-  /// keep track of which node the next insert should be at. This makes each
-  /// insert O(1), and there are a total of O(N) inserts. The suffix link
-  /// helps with inserting children of internal nodes.
-  ///
-  /// Say we add a child to an internal node with associated mapping S. The
-  /// next insertion must be at the node representing S - its first character.
-  /// This is given by the way that we iteratively build the tree in Ukkonen's
-  /// algorithm. The main idea is to look at the suffixes of each prefix in the
-  /// string, starting with the longest suffix of the prefix, and ending with
-  /// the shortest. Therefore, if we keep pointers between such nodes, we can
-  /// move to the next insertion point in O(1) time. If we don't, then we'd
-  /// have to query from the root, which takes O(N) time. This would make the
-  /// construction algorithm O(N^2) rather than O(N).
-  SuffixTreeInternalNode *Link = nullptr;
-
-public:
-  static bool classof(const SuffixTreeNode *N) {
-    return N->getKind() == NodeKind::ST_Internal;
-  }
-
-  /// \returns true if this node is the root of its owning \p SuffixTree.
-  bool isRoot() const { return getStartIdx() == EmptyIdx; }
-
-  /// \returns the end index of this node's substring in the entire string.
-  unsigned getEndIdx() const override { return EndIdx; }
-
-  /// Sets \p Link to \p L. Assumes \p L is not null.
-  void setLink(SuffixTreeInternalNode *L) {
-    assert(L && "Cannot set a null link?");
-    Link = L;
-  }
-
-  /// \returns the pointer to the Link node.
-  SuffixTreeInternalNode *getLink() const {
-    return Link;
-  }
-
-  /// The children of this node.
-  ///
-  /// A child existing on an unsigned integer implies that from the mapping
-  /// represented by the current node, there is a way to reach another
-  /// mapping by tacking that character on the end of the current string.
-  DenseMap<unsigned, SuffixTreeNode *> Children;
-
-  SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
-                         SuffixTreeInternalNode *Link)
-      : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
-        Link(Link) {}
-
-  virtual ~SuffixTreeInternalNode() = default;
-};
-
-struct SuffixTreeLeafNode : SuffixTreeNode {
-private:
-  /// The start index of the suffix represented by this leaf.
-  unsigned SuffixIdx = EmptyIdx;
-
-  /// The end index of this node's substring in the main string.
-  ///
-  /// Every leaf node must have its \p EndIdx incremented at the end of every
-  /// step in the construction algorithm. To avoid having to update O(N)
-  /// nodes individually at the end of every step, the end index is stored
-  /// as a pointer.
-  unsigned *EndIdx = nullptr;
-
-public:
-  static bool classof(const SuffixTreeNode *N) {
-    return N->getKind() == NodeKind::ST_Leaf;
-  }
-
-  /// \returns the end index of this node's substring in the entire string.
-  unsigned getEndIdx() const override {
-    assert(EndIdx && "EndIdx is empty?");
-    return *EndIdx;
-  }
-
-  /// \returns the start index of the suffix represented by this leaf.
-  unsigned getSuffixIdx() const { return SuffixIdx; }
-  /// Sets the start index of the suffix represented by this leaf to \p Idx.
-  void setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
-  SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
-      : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
-
-  virtual ~SuffixTreeLeafNode() = default;
-};
-
 /// A data structure for fast substring queries.
 ///
 /// Suffix trees represent the suffixes of their input strings in their leaves.

diff --git a/llvm/include/llvm/Support/SuffixTreeNode.h b/llvm/include/llvm/Support/SuffixTreeNode.h
new file mode 100644
index 000000000000..7d0d1cf0c58b
--- /dev/null
+++ b/llvm/include/llvm/Support/SuffixTreeNode.h
@@ -0,0 +1,171 @@
+//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+// Each node has either no children or at least two children, with the root
+// being a exception in the empty tree.
+//
+// Children are represented as a map between unsigned integers and nodes. If
+// a node N has a child M on unsigned integer k, then the mapping represented
+// by N is a proper prefix of the mapping represented by M. Note that this,
+// although similar to a trie is somewhat different: each node stores a full
+// substring of the full mapping rather than a single character state.
+//
+// Each internal node contains a pointer to the internal node representing
+// the same string, but with the first character chopped off. This is stored
+// in \p Link. Each leaf node stores the start index of its respective
+// suffix in \p SuffixIdx.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
+#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+/// A node in a suffix tree which represents a substring or suffix.
+struct SuffixTreeNode {
+public:
+  /// Represents an undefined index in the suffix tree.
+  static const unsigned EmptyIdx = -1;
+  enum class NodeKind { ST_Leaf, ST_Internal };
+
+private:
+  const NodeKind Kind;
+
+  /// The start index of this node's substring in the main string.
+  unsigned StartIdx = EmptyIdx;
+
+  /// The length of the string formed by concatenating the edge labels from
+  /// the root to this node.
+  unsigned ConcatLen = 0;
+
+public:
+  // LLVM RTTI boilerplate.
+  NodeKind getKind() const { return Kind; }
+
+  /// \return the start index of this node's substring in the entire string.
+  unsigned getStartIdx() const;
+
+  /// \returns the end index of this node.
+  virtual unsigned getEndIdx() const = 0;
+
+  /// Advance this node's StartIdx by \p Inc.
+  void incrementStartIdx(unsigned Inc);
+
+  /// Set the length of the string from the root to this node to \p Len.
+  void setConcatLen(unsigned Len);
+
+  /// \returns the length of the string from the root to this node.
+  unsigned getConcatLen() const;
+
+  SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
+      : Kind(Kind), StartIdx(StartIdx) {}
+  virtual ~SuffixTreeNode() = default;
+};
+
+// A node with two or more children, or the root.
+struct SuffixTreeInternalNode : SuffixTreeNode {
+private:
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  unsigned EndIdx = EmptyIdx;
+
+  /// A pointer to the internal node representing the same sequence with the
+  /// first character chopped off.
+  ///
+  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+  /// Ukkonen's algorithm does to achieve linear-time construction is
+  /// keep track of which node the next insert should be at. This makes each
+  /// insert O(1), and there are a total of O(N) inserts. The suffix link
+  /// helps with inserting children of internal nodes.
+  ///
+  /// Say we add a child to an internal node with associated mapping S. The
+  /// next insertion must be at the node representing S - its first character.
+  /// This is given by the way that we iteratively build the tree in Ukkonen's
+  /// algorithm. The main idea is to look at the suffixes of each prefix in the
+  /// string, starting with the longest suffix of the prefix, and ending with
+  /// the shortest. Therefore, if we keep pointers between such nodes, we can
+  /// move to the next insertion point in O(1) time. If we don't, then we'd
+  /// have to query from the root, which takes O(N) time. This would make the
+  /// construction algorithm O(N^2) rather than O(N).
+  SuffixTreeInternalNode *Link = nullptr;
+
+public:
+  // LLVM RTTI boilerplate.
+  static bool classof(const SuffixTreeNode *N) {
+    return N->getKind() == NodeKind::ST_Internal;
+  }
+
+  /// \returns true if this node is the root of its owning \p SuffixTree.
+  bool isRoot() const;
+
+  /// \returns the end index of this node's substring in the entire string.
+  unsigned getEndIdx() const override;
+
+  /// Sets \p Link to \p L. Assumes \p L is not null.
+  void setLink(SuffixTreeInternalNode *L);
+
+  /// \returns the pointer to the Link node.
+  SuffixTreeInternalNode *getLink() const;
+
+  /// The children of this node.
+  ///
+  /// A child existing on an unsigned integer implies that from the mapping
+  /// represented by the current node, there is a way to reach another
+  /// mapping by tacking that character on the end of the current string.
+  DenseMap<unsigned, SuffixTreeNode *> Children;
+
+  SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
+                         SuffixTreeInternalNode *Link)
+      : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
+        Link(Link) {}
+
+  virtual ~SuffixTreeInternalNode() = default;
+};
+
+// A node representing a suffix.
+struct SuffixTreeLeafNode : SuffixTreeNode {
+private:
+  /// The start index of the suffix represented by this leaf.
+  unsigned SuffixIdx = EmptyIdx;
+
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  unsigned *EndIdx = nullptr;
+
+public:
+  // LLVM RTTI boilerplate.
+  static bool classof(const SuffixTreeNode *N) {
+    return N->getKind() == NodeKind::ST_Leaf;
+  }
+
+  /// \returns the end index of this node's substring in the entire string.
+  unsigned getEndIdx() const override;
+
+  /// \returns the start index of the suffix represented by this leaf.
+  unsigned getSuffixIdx() const;
+
+  /// Sets the start index of the suffix represented by this leaf to \p Idx.
+  void setSuffixIdx(unsigned Idx);
+  SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
+      : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
+
+  virtual ~SuffixTreeLeafNode() = default;
+};
+} // namespace llvm
+#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
\ No newline at end of file

diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index b000c436670e..714f9ba0214f 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -217,6 +217,7 @@ add_llvm_component_library(LLVMSupport
   StringMap.cpp
   StringSaver.cpp
   StringRef.cpp
+  SuffixTreeNode.cpp
   SuffixTree.cpp
   SystemUtils.cpp
   TarWriter.cpp

diff --git a/llvm/lib/Support/SuffixTreeNode.cpp b/llvm/lib/Support/SuffixTreeNode.cpp
new file mode 100644
index 000000000000..113b990fd352
--- /dev/null
+++ b/llvm/lib/Support/SuffixTreeNode.cpp
@@ -0,0 +1,40 @@
+//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTreeNode.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; }
+void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
+void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; }
+unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; }
+
+bool SuffixTreeInternalNode::isRoot() const {
+  return getStartIdx() == EmptyIdx;
+}
+unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; }
+void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode *L) {
+  assert(L && "Cannot set a null link?");
+  Link = L;
+}
+SuffixTreeInternalNode *SuffixTreeInternalNode::getLink() const { return Link; }
+
+unsigned SuffixTreeLeafNode::getEndIdx() const {
+  assert(EndIdx && "EndIdx is empty?");
+  return *EndIdx;
+}
+
+unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; }
+void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }


        


More information about the llvm-commits mailing list