[llvm] 6cf993e - [NFC] SuffixTree: Split out SuffixTreeNodes into their own files
Jessica Paquette via llvm-commits
llvm-commits at lists.llvm.org
Thu May 11 21:34:24 PDT 2023
Author: Jessica Paquette
Date: 2023-05-11T21:33:39-07:00
New Revision: 6cf993e59bd22197ebfa074235906a48971a6047
URL: https://github.com/llvm/llvm-project/commit/6cf993e59bd22197ebfa074235906a48971a6047
DIFF: https://github.com/llvm/llvm-project/commit/6cf993e59bd22197ebfa074235906a48971a6047.diff
LOG: [NFC] SuffixTree: Split out SuffixTreeNodes into their own files
Add:
- SuffixTreeNode.h
- SuffixTreeNode.cpp
The SuffixTree file was getting too long.
Added:
llvm/include/llvm/Support/SuffixTreeNode.h
llvm/lib/Support/SuffixTreeNode.cpp
Modified:
llvm/include/llvm/Support/SuffixTree.h
llvm/lib/Support/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h
index 372b83d1172a..189dd041bffe 100644
--- a/llvm/include/llvm/Support/SuffixTree.h
+++ b/llvm/include/llvm/Support/SuffixTree.h
@@ -13,167 +13,12 @@
#define LLVM_SUPPORT_SUFFIXTREE_H
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/SuffixTreeNode.h"
#include <vector>
namespace llvm {
-
-/// A node in a suffix tree which represents a substring or suffix.
-///
-/// Each node has either no children or at least two children, with the root
-/// being a exception in the empty tree.
-///
-/// Children are represented as a map between unsigned integers and nodes. If
-/// a node N has a child M on unsigned integer k, then the mapping represented
-/// by N is a proper prefix of the mapping represented by M. Note that this,
-/// although similar to a trie is somewhat different: each node stores a full
-/// substring of the full mapping rather than a single character state.
-///
-/// Each internal node contains a pointer to the internal node representing
-/// the same string, but with the first character chopped off. This is stored
-/// in \p Link. Each leaf node stores the start index of its respective
-/// suffix in \p SuffixIdx.
-struct SuffixTreeNode {
-public:
- /// Represents an undefined index in the suffix tree.
- static const unsigned EmptyIdx = -1;
- enum class NodeKind { ST_Leaf, ST_Internal };
-
-private:
- const NodeKind Kind;
- /// The start index of this node's substring in the main string.
- unsigned StartIdx = EmptyIdx;
-
- /// The length of the string formed by concatenating the edge labels from
- /// the root to this node.
- unsigned ConcatLen = 0;
-
-public:
- NodeKind getKind() const { return Kind; }
-
- /// \return the start index of this node's substring in the entire string.
- virtual unsigned getStartIdx() const { return StartIdx; }
-
- /// \returns the end index of this node.
- virtual unsigned getEndIdx() const = 0;
-
- /// Advance this node's StartIdx by \p Inc.
- void incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
-
- /// Set the length of the string from the root to this node to \p Len.
- void setConcatLen(unsigned Len) { ConcatLen = Len; }
-
- /// \returns the length of the string from the root to this node.
- unsigned getConcatLen() const { return ConcatLen; }
-
- SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
- : Kind(Kind), StartIdx(StartIdx) {}
- virtual ~SuffixTreeNode() = default;
-};
-
-struct SuffixTreeInternalNode : SuffixTreeNode {
-private:
- /// The end index of this node's substring in the main string.
- ///
- /// Every leaf node must have its \p EndIdx incremented at the end of every
- /// step in the construction algorithm. To avoid having to update O(N)
- /// nodes individually at the end of every step, the end index is stored
- /// as a pointer.
- unsigned EndIdx = EmptyIdx;
-
- /// A pointer to the internal node representing the same sequence with the
- /// first character chopped off.
- ///
- /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
- /// Ukkonen's algorithm does to achieve linear-time construction is
- /// keep track of which node the next insert should be at. This makes each
- /// insert O(1), and there are a total of O(N) inserts. The suffix link
- /// helps with inserting children of internal nodes.
- ///
- /// Say we add a child to an internal node with associated mapping S. The
- /// next insertion must be at the node representing S - its first character.
- /// This is given by the way that we iteratively build the tree in Ukkonen's
- /// algorithm. The main idea is to look at the suffixes of each prefix in the
- /// string, starting with the longest suffix of the prefix, and ending with
- /// the shortest. Therefore, if we keep pointers between such nodes, we can
- /// move to the next insertion point in O(1) time. If we don't, then we'd
- /// have to query from the root, which takes O(N) time. This would make the
- /// construction algorithm O(N^2) rather than O(N).
- SuffixTreeInternalNode *Link = nullptr;
-
-public:
- static bool classof(const SuffixTreeNode *N) {
- return N->getKind() == NodeKind::ST_Internal;
- }
-
- /// \returns true if this node is the root of its owning \p SuffixTree.
- bool isRoot() const { return getStartIdx() == EmptyIdx; }
-
- /// \returns the end index of this node's substring in the entire string.
- unsigned getEndIdx() const override { return EndIdx; }
-
- /// Sets \p Link to \p L. Assumes \p L is not null.
- void setLink(SuffixTreeInternalNode *L) {
- assert(L && "Cannot set a null link?");
- Link = L;
- }
-
- /// \returns the pointer to the Link node.
- SuffixTreeInternalNode *getLink() const {
- return Link;
- }
-
- /// The children of this node.
- ///
- /// A child existing on an unsigned integer implies that from the mapping
- /// represented by the current node, there is a way to reach another
- /// mapping by tacking that character on the end of the current string.
- DenseMap<unsigned, SuffixTreeNode *> Children;
-
- SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
- SuffixTreeInternalNode *Link)
- : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
- Link(Link) {}
-
- virtual ~SuffixTreeInternalNode() = default;
-};
-
-struct SuffixTreeLeafNode : SuffixTreeNode {
-private:
- /// The start index of the suffix represented by this leaf.
- unsigned SuffixIdx = EmptyIdx;
-
- /// The end index of this node's substring in the main string.
- ///
- /// Every leaf node must have its \p EndIdx incremented at the end of every
- /// step in the construction algorithm. To avoid having to update O(N)
- /// nodes individually at the end of every step, the end index is stored
- /// as a pointer.
- unsigned *EndIdx = nullptr;
-
-public:
- static bool classof(const SuffixTreeNode *N) {
- return N->getKind() == NodeKind::ST_Leaf;
- }
-
- /// \returns the end index of this node's substring in the entire string.
- unsigned getEndIdx() const override {
- assert(EndIdx && "EndIdx is empty?");
- return *EndIdx;
- }
-
- /// \returns the start index of the suffix represented by this leaf.
- unsigned getSuffixIdx() const { return SuffixIdx; }
- /// Sets the start index of the suffix represented by this leaf to \p Idx.
- void setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
- SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
- : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
-
- virtual ~SuffixTreeLeafNode() = default;
-};
-
/// A data structure for fast substring queries.
///
/// Suffix trees represent the suffixes of their input strings in their leaves.
diff --git a/llvm/include/llvm/Support/SuffixTreeNode.h b/llvm/include/llvm/Support/SuffixTreeNode.h
new file mode 100644
index 000000000000..7d0d1cf0c58b
--- /dev/null
+++ b/llvm/include/llvm/Support/SuffixTreeNode.h
@@ -0,0 +1,171 @@
+//===- llvm/Support/SuffixTreeNode.h - Nodes for SuffixTrees ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+// Each node has either no children or at least two children, with the root
+// being an exception in the empty tree.
+//
+// Children are represented as a map between unsigned integers and nodes. If
+// a node N has a child M on unsigned integer k, then the mapping represented
+// by N is a proper prefix of the mapping represented by M. Note that this,
+// although similar to a trie is somewhat different: each node stores a full
+// substring of the full mapping rather than a single character state.
+//
+// Each internal node contains a pointer to the internal node representing
+// the same string, but with the first character chopped off. This is stored
+// in \p Link. Each leaf node stores the start index of its respective
+// suffix in \p SuffixIdx.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
+#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// This type cannot be instantiated directly because getEndIdx() is pure
+/// virtual. Each node records its concrete kind in \p Kind, which getKind()
+/// exposes for kind-based type checks.
+struct SuffixTreeNode {
+public:
+ /// Represents an undefined index in the suffix tree.
+ ///
+ /// Initialized from -1, i.e. the largest value an unsigned can hold.
+ static const unsigned EmptyIdx = -1;
+
+ /// Discriminator identifying the concrete type of a node.
+ enum class NodeKind { ST_Leaf, ST_Internal };
+
+private:
+ /// The concrete kind of this node; fixed at construction.
+ const NodeKind Kind;
+
+ /// The start index of this node's substring in the main string.
+ unsigned StartIdx = EmptyIdx;
+
+ /// The length of the string formed by concatenating the edge labels from
+ /// the root to this node.
+ unsigned ConcatLen = 0;
+
+public:
+ // LLVM RTTI boilerplate.
+ /// \returns the kind tag this node was constructed with.
+ NodeKind getKind() const { return Kind; }
+
+ /// \returns the start index of this node's substring in the entire string.
+ unsigned getStartIdx() const;
+
+ /// \returns the end index of this node. Every concrete node type must
+ /// provide its own implementation.
+ virtual unsigned getEndIdx() const = 0;
+
+ /// Advance this node's StartIdx by \p Inc.
+ void incrementStartIdx(unsigned Inc);
+
+ /// Set the length of the string from the root to this node to \p Len.
+ void setConcatLen(unsigned Len);
+
+ /// \returns the length of the string from the root to this node.
+ unsigned getConcatLen() const;
+
+ /// Creates a node of kind \p Kind whose substring starts at \p StartIdx.
+ /// The concatenated length keeps its default of 0 until setConcatLen() is
+ /// called.
+ SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
+ : Kind(Kind), StartIdx(StartIdx) {}
+ virtual ~SuffixTreeNode() = default;
+};
+
+/// A node with two or more children, or the root.
+struct SuffixTreeInternalNode : SuffixTreeNode {
+private:
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Unlike a leaf node, which reads its end index through a pointer, an
+ /// internal node stores its end index directly as a plain value. It is
+ /// set once, in the constructor; this struct declares no member that
+ /// changes it afterwards.
+ unsigned EndIdx = EmptyIdx;
+
+ /// A pointer to the internal node representing the same sequence with the
+ /// first character chopped off.
+ ///
+ /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+ /// Say we add a child to an internal node with associated mapping S. The
+ /// next insertion must be at the node representing S - its first character.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ SuffixTreeInternalNode *Link = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ /// \returns true if \p N was constructed with the ST_Internal kind tag.
+ static bool classof(const SuffixTreeNode *N) {
+ return N->getKind() == NodeKind::ST_Internal;
+ }
+
+ /// \returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const;
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// Sets \p Link to \p L. Assumes \p L is not null.
+ void setLink(SuffixTreeInternalNode *L);
+
+ /// \returns the pointer to the Link node.
+ SuffixTreeInternalNode *getLink() const;
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ DenseMap<unsigned, SuffixTreeNode *> Children;
+
+ /// Creates an internal node covering the substring from \p StartIdx to
+ /// \p EndIdx, with suffix link \p Link. Unlike setLink(), the constructor
+ /// does not check \p Link for null.
+ SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
+ SuffixTreeInternalNode *Link)
+ : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
+ Link(Link) {}
+
+ virtual ~SuffixTreeInternalNode() = default;
+};
+
+/// A node representing a suffix.
+struct SuffixTreeLeafNode : SuffixTreeNode {
+private:
+ /// The start index of the suffix represented by this leaf.
+ ///
+ /// Remains EmptyIdx until setSuffixIdx() is called.
+ unsigned SuffixIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ ///
+ /// The pointee is not owned by this node; this struct never allocates or
+ /// frees it. Presumably the owning tree keeps the pointed-to value alive
+ /// and shares it between leaves -- confirm against SuffixTree.
+ unsigned *EndIdx = nullptr;
+
+public:
+ // LLVM RTTI boilerplate.
+ /// \returns true if \p N was constructed with the ST_Leaf kind tag.
+ static bool classof(const SuffixTreeNode *N) {
+ return N->getKind() == NodeKind::ST_Leaf;
+ }
+
+ /// \returns the end index of this node's substring in the entire string.
+ unsigned getEndIdx() const override;
+
+ /// \returns the start index of the suffix represented by this leaf.
+ unsigned getSuffixIdx() const;
+
+ /// Sets the start index of the suffix represented by this leaf to \p Idx.
+ void setSuffixIdx(unsigned Idx);
+
+ /// Creates a leaf whose substring starts at \p StartIdx and whose end index
+ /// is read through \p EndIdx. The suffix index is left as EmptyIdx.
+ SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
+ : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
+
+ virtual ~SuffixTreeLeafNode() = default;
+};
+} // namespace llvm
+#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
\ No newline at end of file
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index b000c436670e..714f9ba0214f 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -217,6 +217,7 @@ add_llvm_component_library(LLVMSupport
StringMap.cpp
StringSaver.cpp
StringRef.cpp
+ SuffixTreeNode.cpp
SuffixTree.cpp
SystemUtils.cpp
TarWriter.cpp
diff --git a/llvm/lib/Support/SuffixTreeNode.cpp b/llvm/lib/Support/SuffixTreeNode.cpp
new file mode 100644
index 000000000000..113b990fd352
--- /dev/null
+++ b/llvm/lib/Support/SuffixTreeNode.cpp
@@ -0,0 +1,40 @@
+//===- llvm/ADT/SuffixTreeNode.cpp - Nodes for SuffixTrees --------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines nodes for use within a SuffixTree.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTreeNode.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+// Out-of-line definitions of the SuffixTreeNode accessors. Each one reads
+// or updates a single private field.
+
+// Returns the stored start index.
+unsigned SuffixTreeNode::getStartIdx() const { return StartIdx; }
+// Adds Inc to the stored start index.
+void SuffixTreeNode::incrementStartIdx(unsigned Inc) { StartIdx += Inc; }
+// Overwrites the stored root-to-node string length with Len.
+void SuffixTreeNode::setConcatLen(unsigned Len) { ConcatLen = Len; }
+// Returns the stored root-to-node string length.
+unsigned SuffixTreeNode::getConcatLen() const { return ConcatLen; }
+
+// A node is treated as the root exactly when its start index is the
+// EmptyIdx sentinel.
+bool SuffixTreeInternalNode::isRoot() const {
+ return getStartIdx() == EmptyIdx;
+}
+// Returns the end index held by value in this internal node.
+unsigned SuffixTreeInternalNode::getEndIdx() const { return EndIdx; }
+// Stores L as this node's suffix link. A null L trips the assertion in
+// builds where assertions are enabled.
+void SuffixTreeInternalNode::setLink(SuffixTreeInternalNode *L) {
+ assert(L && "Cannot set a null link?");
+ Link = L;
+}
+// Returns the stored suffix link, which may be null if none was ever set.
+SuffixTreeInternalNode *SuffixTreeInternalNode::getLink() const { return Link; }
+
+// Reads the end index through the stored pointer. The pointer must be
+// non-null; this is checked only in builds where assertions are enabled.
+unsigned SuffixTreeLeafNode::getEndIdx() const {
+ assert(EndIdx && "EndIdx is empty?");
+ return *EndIdx;
+}
+
+// Returns the stored suffix start index.
+unsigned SuffixTreeLeafNode::getSuffixIdx() const { return SuffixIdx; }
+// Overwrites the stored suffix start index with Idx.
+void SuffixTreeLeafNode::setSuffixIdx(unsigned Idx) { SuffixIdx = Idx; }
More information about the llvm-commits
mailing list