[llvm] bb677ca - [SuffixTree][MachOpt] Factoring out Suffix Tree and adding Unit Tests

Andrew Litteken via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 8 12:44:29 PDT 2020


Author: Andrew Litteken
Date: 2020-06-08T12:44:18-07:00
New Revision: bb677cacc803022211827509754a993ab2febf93

URL: https://github.com/llvm/llvm-project/commit/bb677cacc803022211827509754a993ab2febf93
DIFF: https://github.com/llvm/llvm-project/commit/bb677cacc803022211827509754a993ab2febf93.diff

LOG: [SuffixTree][MachOpt] Factoring out Suffix Tree and adding Unit Tests

This moves the SuffixTree test used in the Machine Outliner and moves it into Support for use in other outliners elsewhere in the compilation pipeline.

Differential Revision: https://reviews.llvm.org/D80586

Added: 
    llvm/include/llvm/Support/SuffixTree.h
    llvm/lib/Support/SuffixTree.cpp
    llvm/unittests/Support/SuffixTreeTest.cpp

Modified: 
    llvm/lib/CodeGen/MachineOutliner.cpp
    llvm/lib/Support/CMakeLists.txt
    llvm/unittests/Support/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h
new file mode 100644
index 000000000000..67d513d032ce
--- /dev/null
+++ b/llvm/include/llvm/Support/SuffixTree.h
@@ -0,0 +1,350 @@
+//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Suffix Tree class and Suffix Tree Node struct.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_SUPPORT_SUFFIXTREE_H
+#define LLVM_SUPPORT_SUFFIXTREE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace llvm {
+
+/// Represents an undefined index in the suffix tree.
+const unsigned EmptyIdx = -1;
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being a exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie is somewhat 
diff erent: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+
+  /// The children of this node.
+  ///
+  /// A child existing on an unsigned integer implies that from the mapping
+  /// represented by the current node, there is a way to reach another
+  /// mapping by tacking that character on the end of the current string.
+  llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
+
+  /// The start index of this node's substring in the main string.
+  unsigned StartIdx = EmptyIdx;
+
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  unsigned *EndIdx = nullptr;
+
+  /// For leaves, the start index of the suffix represented by this node.
+  ///
+  /// For all other nodes, this is ignored.
+  unsigned SuffixIdx = EmptyIdx;
+
+  /// For internal nodes, a pointer to the internal node representing
+  /// the same sequence with the first character chopped off.
+  ///
+  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+  /// Ukkonen's algorithm does to achieve linear-time construction is
+  /// keep track of which node the next insert should be at. This makes each
+  /// insert O(1), and there are a total of O(N) inserts. The suffix link
+  /// helps with inserting children of internal nodes.
+  ///
+  /// Say we add a child to an internal node with associated mapping S. The
+  /// next insertion must be at the node representing S - its first character.
+  /// This is given by the way that we iteratively build the tree in Ukkonen's
+  /// algorithm. The main idea is to look at the suffixes of each prefix in the
+  /// string, starting with the longest suffix of the prefix, and ending with
+  /// the shortest. Therefore, if we keep pointers between such nodes, we can
+  /// move to the next insertion point in O(1) time. If we don't, then we'd
+  /// have to query from the root, which takes O(N) time. This would make the
+  /// construction algorithm O(N^2) rather than O(N).
+  SuffixTreeNode *Link = nullptr;
+
+  /// The length of the string formed by concatenating the edge labels from the
+  /// root to this node.
+  unsigned ConcatLen = 0;
+
+  /// Returns true if this node is a leaf.
+  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+
+  /// Returns true if this node is the root of its owning \p SuffixTree.
+  bool isRoot() const { return StartIdx == EmptyIdx; }
+
+  /// Return the number of elements in the substring associated with this node.
+  size_t size() const {
+
+    // Is it the root? If so, it's the empty string so return 0.
+    if (isRoot())
+      return 0;
+
+    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+
+    // Size = the number of elements in the string.
+    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
+    return *EndIdx - StartIdx + 1;
+  }
+
+  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
+      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
+
+  SuffixTreeNode() {}
+};
+
+/// A data structure for fast substring queries.
+///
+/// Suffix trees represent the suffixes of their input strings in their leaves.
+/// A suffix tree is a type of compressed trie structure where each node
+/// represents an entire substring rather than a single character. Each leaf
+/// of the tree is a suffix.
+///
+/// A suffix tree can be seen as a type of state machine where each state is a
+/// substring of the full string. The tree is structured so that, for a string
+/// of length N, there are exactly N leaves in the tree. This structure allows
+/// us to quickly find repeated substrings of the input string.
+///
+/// In this implementation, a "string" is a vector of unsigned integers.
+/// These integers may result from hashing some data type. A suffix tree can
+/// contain 1 or many strings, which can then be queried as one large string.
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+public:
+  /// Each element is an integer representing an instruction in the module.
+  llvm::ArrayRef<unsigned> Str;
+
+  /// A repeated substring in the tree.
+  struct RepeatedSubstring {
+    /// The length of the string.
+    unsigned Length;
+
+    /// The start indices of each occurrence.
+    std::vector<unsigned> StartIndices;
+  };
+
+private:
+  /// Maintains each node in the tree.
+  llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+  /// The root of the suffix tree.
+  ///
+  /// The root represents the empty string. It is maintained by the
+  /// \p NodeAllocator like every other node in the tree.
+  SuffixTreeNode *Root = nullptr;
+
+  /// Maintains the end indices of the internal nodes in the tree.
+  ///
+  /// Each internal node is guaranteed to never have its end index change
+  /// during the construction algorithm; however, leaves must be updated at
+  /// every step. Therefore, we need to store leaf end indices by reference
+  /// to avoid updating O(N) leaves at every step of construction. Thus,
+  /// every internal node must be allocated its own end index.
+  llvm::BumpPtrAllocator InternalEndIdxAllocator;
+
+  /// The end index of each leaf in the tree.
+  unsigned LeafEndIdx = -1;
+
+  /// Helper struct which keeps track of the next insertion point in
+  /// Ukkonen's algorithm.
+  struct ActiveState {
+    /// The next node to insert at.
+    SuffixTreeNode *Node = nullptr;
+
+    /// The index of the first character in the substring currently being added.
+    unsigned Idx = EmptyIdx;
+
+    /// The length of the substring we have to add at the current step.
+    unsigned Len = 0;
+  };
+
+  /// The point the next insertion will take place at in the
+  /// construction algorithm.
+  ActiveState Active;
+
+  /// Allocate a leaf node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated leaf node.
+  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
+                             unsigned Edge);
+
+  /// Allocate an internal node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node. Only null when allocating the root.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param EndIdx The end index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated internal node.
+  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
+                                     unsigned EndIdx, unsigned Edge);
+
+  /// Set the suffix indices of the leaves to the start indices of their
+  /// respective suffixes.
+  void setSuffixIndices();
+
+  /// Construct the suffix tree for the prefix of the input ending at
+  /// \p EndIdx.
+  ///
+  /// Used to construct the full suffix tree iteratively. At the end of each
+  /// step, the constructed suffix tree is either a valid suffix tree, or a
+  /// suffix tree with implicit suffixes. At the end of the final step, the
+  /// suffix tree is a valid tree.
+  ///
+  /// \param EndIdx The end index of the current prefix in the main string.
+  /// \param SuffixesToAdd The number of suffixes that must be added
+  /// to complete the suffix tree at the current phase.
+  ///
+  /// \returns The number of suffixes that have not been added at the end of
+  /// this step.
+  unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
+
+public:
+  /// Construct a suffix tree from a sequence of unsigned integers.
+  ///
+  /// \param Str The string to construct the suffix tree for.
+  SuffixTree(const std::vector<unsigned> &Str);
+
+  /// Iterator for finding all repeated substrings in the suffix tree.
+  struct RepeatedSubstringIterator {
+  private:
+    /// The current node we're visiting.
+    SuffixTreeNode *N = nullptr;
+
+    /// The repeated substring associated with this node.
+    RepeatedSubstring RS;
+
+    /// The nodes left to visit.
+    std::vector<SuffixTreeNode *> ToVisit;
+
+    /// The minimum length of a repeated substring to find.
+    /// Since we're outlining, we want at least two instructions in the range.
+    /// FIXME: This may not be true for targets like X86 which support many
+    /// instruction lengths.
+    const unsigned MinLength = 2;
+
+    /// Move the iterator to the next repeated substring.
+    void advance() {
+      // Clear the current state. If we're at the end of the range, then this
+      // is the state we want to be in.
+      RS = RepeatedSubstring();
+      N = nullptr;
+
+      // Each leaf node represents a repeat of a string.
+      std::vector<SuffixTreeNode *> LeafChildren;
+
+      // Continue visiting nodes until we find one which repeats more than once.
+      while (!ToVisit.empty()) {
+        SuffixTreeNode *Curr = ToVisit.back();
+        ToVisit.pop_back();
+        LeafChildren.clear();
+
+        // Keep track of the length of the string associated with the node. If
+        // it's too short, we'll quit.
+        unsigned Length = Curr->ConcatLen;
+
+        // Iterate over each child, saving internal nodes for visiting, and
+        // leaf nodes in LeafChildren. Internal nodes represent individual
+        // strings, which may repeat.
+        for (auto &ChildPair : Curr->Children) {
+          // Save all of this node's children for processing.
+          if (!ChildPair.second->isLeaf())
+            ToVisit.push_back(ChildPair.second);
+
+          // It's not an internal node, so it must be a leaf. If we have a
+          // long enough string, then save the leaf children.
+          else if (Length >= MinLength)
+            LeafChildren.push_back(ChildPair.second);
+        }
+
+        // The root never represents a repeated substring. If we're looking at
+        // that, then skip it.
+        if (Curr->isRoot())
+          continue;
+
+        // Do we have any repeated substrings?
+        if (LeafChildren.size() >= 2) {
+          // Yes. Update the state to reflect this, and then bail out.
+          N = Curr;
+          RS.Length = Length;
+          for (SuffixTreeNode *Leaf : LeafChildren)
+            RS.StartIndices.push_back(Leaf->SuffixIdx);
+          break;
+        }
+      }
+
+      // At this point, either NewRS is an empty RepeatedSubstring, or it was
+      // set in the above loop. Similarly, N is either nullptr, or the node
+      // associated with NewRS.
+    }
+
+  public:
+    /// Return the current repeated substring.
+    RepeatedSubstring &operator*() { return RS; }
+
+    RepeatedSubstringIterator &operator++() {
+      advance();
+      return *this;
+    }
+
+    RepeatedSubstringIterator operator++(int I) {
+      RepeatedSubstringIterator It(*this);
+      advance();
+      return It;
+    }
+
+    bool operator==(const RepeatedSubstringIterator &Other) {
+      return N == Other.N;
+    }
+    bool operator!=(const RepeatedSubstringIterator &Other) {
+      return !(*this == Other);
+    }
+
+    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+      // Do we have a non-null node?
+      if (N) {
+        // Yes. At the first step, we need to visit all of N's children.
+        // Note: This means that we visit N last.
+        ToVisit.push_back(N);
+        advance();
+      }
+    }
+  };
+
+  typedef RepeatedSubstringIterator iterator;
+  iterator begin() { return iterator(Root); }
+  iterator end() { return iterator(nullptr); }
+};
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_SUFFIXTREE_H

diff  --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 8bd02f0ed24a..d70638bd3840 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -70,9 +70,9 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/raw_ostream.h"
 #include <functional>
 #include <tuple>
@@ -107,513 +107,6 @@ static cl::opt<unsigned> OutlinerReruns(
 
 namespace {
 
-/// Represents an undefined index in the suffix tree.
-const unsigned EmptyIdx = -1;
-
-/// A node in a suffix tree which represents a substring or suffix.
-///
-/// Each node has either no children or at least two children, with the root
-/// being a exception in the empty tree.
-///
-/// Children are represented as a map between unsigned integers and nodes. If
-/// a node N has a child M on unsigned integer k, then the mapping represented
-/// by N is a proper prefix of the mapping represented by M. Note that this,
-/// although similar to a trie is somewhat 
diff erent: each node stores a full
-/// substring of the full mapping rather than a single character state.
-///
-/// Each internal node contains a pointer to the internal node representing
-/// the same string, but with the first character chopped off. This is stored
-/// in \p Link. Each leaf node stores the start index of its respective
-/// suffix in \p SuffixIdx.
-struct SuffixTreeNode {
-
-  /// The children of this node.
-  ///
-  /// A child existing on an unsigned integer implies that from the mapping
-  /// represented by the current node, there is a way to reach another
-  /// mapping by tacking that character on the end of the current string.
-  DenseMap<unsigned, SuffixTreeNode *> Children;
-
-  /// The start index of this node's substring in the main string.
-  unsigned StartIdx = EmptyIdx;
-
-  /// The end index of this node's substring in the main string.
-  ///
-  /// Every leaf node must have its \p EndIdx incremented at the end of every
-  /// step in the construction algorithm. To avoid having to update O(N)
-  /// nodes individually at the end of every step, the end index is stored
-  /// as a pointer.
-  unsigned *EndIdx = nullptr;
-
-  /// For leaves, the start index of the suffix represented by this node.
-  ///
-  /// For all other nodes, this is ignored.
-  unsigned SuffixIdx = EmptyIdx;
-
-  /// For internal nodes, a pointer to the internal node representing
-  /// the same sequence with the first character chopped off.
-  ///
-  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
-  /// Ukkonen's algorithm does to achieve linear-time construction is
-  /// keep track of which node the next insert should be at. This makes each
-  /// insert O(1), and there are a total of O(N) inserts. The suffix link
-  /// helps with inserting children of internal nodes.
-  ///
-  /// Say we add a child to an internal node with associated mapping S. The
-  /// next insertion must be at the node representing S - its first character.
-  /// This is given by the way that we iteratively build the tree in Ukkonen's
-  /// algorithm. The main idea is to look at the suffixes of each prefix in the
-  /// string, starting with the longest suffix of the prefix, and ending with
-  /// the shortest. Therefore, if we keep pointers between such nodes, we can
-  /// move to the next insertion point in O(1) time. If we don't, then we'd
-  /// have to query from the root, which takes O(N) time. This would make the
-  /// construction algorithm O(N^2) rather than O(N).
-  SuffixTreeNode *Link = nullptr;
-
-  /// The length of the string formed by concatenating the edge labels from the
-  /// root to this node.
-  unsigned ConcatLen = 0;
-
-  /// Returns true if this node is a leaf.
-  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
-
-  /// Returns true if this node is the root of its owning \p SuffixTree.
-  bool isRoot() const { return StartIdx == EmptyIdx; }
-
-  /// Return the number of elements in the substring associated with this node.
-  size_t size() const {
-
-    // Is it the root? If so, it's the empty string so return 0.
-    if (isRoot())
-      return 0;
-
-    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
-
-    // Size = the number of elements in the string.
-    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
-    return *EndIdx - StartIdx + 1;
-  }
-
-  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
-      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
-
-  SuffixTreeNode() {}
-};
-
-/// A data structure for fast substring queries.
-///
-/// Suffix trees represent the suffixes of their input strings in their leaves.
-/// A suffix tree is a type of compressed trie structure where each node
-/// represents an entire substring rather than a single character. Each leaf
-/// of the tree is a suffix.
-///
-/// A suffix tree can be seen as a type of state machine where each state is a
-/// substring of the full string. The tree is structured so that, for a string
-/// of length N, there are exactly N leaves in the tree. This structure allows
-/// us to quickly find repeated substrings of the input string.
-///
-/// In this implementation, a "string" is a vector of unsigned integers.
-/// These integers may result from hashing some data type. A suffix tree can
-/// contain 1 or many strings, which can then be queried as one large string.
-///
-/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
-/// suffix tree construction. Ukkonen's algorithm is explained in more detail
-/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
-/// paper is available at
-///
-/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
-class SuffixTree {
-public:
-  /// Each element is an integer representing an instruction in the module.
-  ArrayRef<unsigned> Str;
-
-  /// A repeated substring in the tree.
-  struct RepeatedSubstring {
-    /// The length of the string.
-    unsigned Length;
-
-    /// The start indices of each occurrence.
-    std::vector<unsigned> StartIndices;
-  };
-
-private:
-  /// Maintains each node in the tree.
-  SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
-
-  /// The root of the suffix tree.
-  ///
-  /// The root represents the empty string. It is maintained by the
-  /// \p NodeAllocator like every other node in the tree.
-  SuffixTreeNode *Root = nullptr;
-
-  /// Maintains the end indices of the internal nodes in the tree.
-  ///
-  /// Each internal node is guaranteed to never have its end index change
-  /// during the construction algorithm; however, leaves must be updated at
-  /// every step. Therefore, we need to store leaf end indices by reference
-  /// to avoid updating O(N) leaves at every step of construction. Thus,
-  /// every internal node must be allocated its own end index.
-  BumpPtrAllocator InternalEndIdxAllocator;
-
-  /// The end index of each leaf in the tree.
-  unsigned LeafEndIdx = -1;
-
-  /// Helper struct which keeps track of the next insertion point in
-  /// Ukkonen's algorithm.
-  struct ActiveState {
-    /// The next node to insert at.
-    SuffixTreeNode *Node = nullptr;
-
-    /// The index of the first character in the substring currently being added.
-    unsigned Idx = EmptyIdx;
-
-    /// The length of the substring we have to add at the current step.
-    unsigned Len = 0;
-  };
-
-  /// The point the next insertion will take place at in the
-  /// construction algorithm.
-  ActiveState Active;
-
-  /// Allocate a leaf node and add it to the tree.
-  ///
-  /// \param Parent The parent of this node.
-  /// \param StartIdx The start index of this node's associated string.
-  /// \param Edge The label on the edge leaving \p Parent to this node.
-  ///
-  /// \returns A pointer to the allocated leaf node.
-  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
-                             unsigned Edge) {
-
-    assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
-
-    SuffixTreeNode *N = new (NodeAllocator.Allocate())
-        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
-    Parent.Children[Edge] = N;
-
-    return N;
-  }
-
-  /// Allocate an internal node and add it to the tree.
-  ///
-  /// \param Parent The parent of this node. Only null when allocating the root.
-  /// \param StartIdx The start index of this node's associated string.
-  /// \param EndIdx The end index of this node's associated string.
-  /// \param Edge The label on the edge leaving \p Parent to this node.
-  ///
-  /// \returns A pointer to the allocated internal node.
-  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
-                                     unsigned EndIdx, unsigned Edge) {
-
-    assert(StartIdx <= EndIdx && "String can't start after it ends!");
-    assert(!(!Parent && StartIdx != EmptyIdx) &&
-           "Non-root internal nodes must have parents!");
-
-    unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
-    SuffixTreeNode *N =
-        new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
-    if (Parent)
-      Parent->Children[Edge] = N;
-
-    return N;
-  }
-
-  /// Set the suffix indices of the leaves to the start indices of their
-  /// respective suffixes.
-  void setSuffixIndices() {
-    // List of nodes we need to visit along with the current length of the
-    // string.
-    std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
-
-    // Current node being visited.
-    SuffixTreeNode *CurrNode = Root;
-
-    // Sum of the lengths of the nodes down the path to the current one.
-    unsigned CurrNodeLen = 0;
-    ToVisit.push_back({CurrNode, CurrNodeLen});
-    while (!ToVisit.empty()) {
-      std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
-      ToVisit.pop_back();
-      CurrNode->ConcatLen = CurrNodeLen;
-      for (auto &ChildPair : CurrNode->Children) {
-        assert(ChildPair.second && "Node had a null child!");
-        ToVisit.push_back(
-            {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
-      }
-
-      // No children, so we are at the end of the string.
-      if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
-        CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
-    }
-  }
-
-  /// Construct the suffix tree for the prefix of the input ending at
-  /// \p EndIdx.
-  ///
-  /// Used to construct the full suffix tree iteratively. At the end of each
-  /// step, the constructed suffix tree is either a valid suffix tree, or a
-  /// suffix tree with implicit suffixes. At the end of the final step, the
-  /// suffix tree is a valid tree.
-  ///
-  /// \param EndIdx The end index of the current prefix in the main string.
-  /// \param SuffixesToAdd The number of suffixes that must be added
-  /// to complete the suffix tree at the current phase.
-  ///
-  /// \returns The number of suffixes that have not been added at the end of
-  /// this step.
-  unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd) {
-    SuffixTreeNode *NeedsLink = nullptr;
-
-    while (SuffixesToAdd > 0) {
-
-      // Are we waiting to add anything other than just the last character?
-      if (Active.Len == 0) {
-        // If not, then say the active index is the end index.
-        Active.Idx = EndIdx;
-      }
-
-      assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
-
-      // The first character in the current substring we're looking at.
-      unsigned FirstChar = Str[Active.Idx];
-
-      // Have we inserted anything starting with FirstChar at the current node?
-      if (Active.Node->Children.count(FirstChar) == 0) {
-        // If not, then we can just insert a leaf and move too the next step.
-        insertLeaf(*Active.Node, EndIdx, FirstChar);
-
-        // The active node is an internal node, and we visited it, so it must
-        // need a link if it doesn't have one.
-        if (NeedsLink) {
-          NeedsLink->Link = Active.Node;
-          NeedsLink = nullptr;
-        }
-      } else {
-        // There's a match with FirstChar, so look for the point in the tree to
-        // insert a new node.
-        SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
-
-        unsigned SubstringLen = NextNode->size();
-
-        // Is the current suffix we're trying to insert longer than the size of
-        // the child we want to move to?
-        if (Active.Len >= SubstringLen) {
-          // If yes, then consume the characters we've seen and move to the next
-          // node.
-          Active.Idx += SubstringLen;
-          Active.Len -= SubstringLen;
-          Active.Node = NextNode;
-          continue;
-        }
-
-        // Otherwise, the suffix we're trying to insert must be contained in the
-        // next node we want to move to.
-        unsigned LastChar = Str[EndIdx];
-
-        // Is the string we're trying to insert a substring of the next node?
-        if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
-          // If yes, then we're done for this step. Remember our insertion point
-          // and move to the next end index. At this point, we have an implicit
-          // suffix tree.
-          if (NeedsLink && !Active.Node->isRoot()) {
-            NeedsLink->Link = Active.Node;
-            NeedsLink = nullptr;
-          }
-
-          Active.Len++;
-          break;
-        }
-
-        // The string we're trying to insert isn't a substring of the next node,
-        // but matches up to a point. Split the node.
-        //
-        // For example, say we ended our search at a node n and we're trying to
-        // insert ABD. Then we'll create a new node s for AB, reduce n to just
-        // representing C, and insert a new leaf node l to represent d. This
-        // allows us to ensure that if n was a leaf, it remains a leaf.
-        //
-        //   | ABC  ---split--->  | AB
-        //   n                    s
-        //                     C / \ D
-        //                      n   l
-
-        // The node s from the diagram
-        SuffixTreeNode *SplitNode =
-            insertInternalNode(Active.Node, NextNode->StartIdx,
-                               NextNode->StartIdx + Active.Len - 1, FirstChar);
-
-        // Insert the new node representing the new substring into the tree as
-        // a child of the split node. This is the node l from the diagram.
-        insertLeaf(*SplitNode, EndIdx, LastChar);
-
-        // Make the old node a child of the split node and update its start
-        // index. This is the node n from the diagram.
-        NextNode->StartIdx += Active.Len;
-        SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
-
-        // SplitNode is an internal node, update the suffix link.
-        if (NeedsLink)
-          NeedsLink->Link = SplitNode;
-
-        NeedsLink = SplitNode;
-      }
-
-      // We've added something new to the tree, so there's one less suffix to
-      // add.
-      SuffixesToAdd--;
-
-      if (Active.Node->isRoot()) {
-        if (Active.Len > 0) {
-          Active.Len--;
-          Active.Idx = EndIdx - SuffixesToAdd + 1;
-        }
-      } else {
-        // Start the next phase at the next smallest suffix.
-        Active.Node = Active.Node->Link;
-      }
-    }
-
-    return SuffixesToAdd;
-  }
-
-public:
-  /// Construct a suffix tree from a sequence of unsigned integers.
-  ///
-  /// \param Str The string to construct the suffix tree for.
-  SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
-    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
-    Active.Node = Root;
-
-    // Keep track of the number of suffixes we have to add of the current
-    // prefix.
-    unsigned SuffixesToAdd = 0;
-
-    // Construct the suffix tree iteratively on each prefix of the string.
-    // PfxEndIdx is the end index of the current prefix.
-    // End is one past the last element in the string.
-    for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End;
-         PfxEndIdx++) {
-      SuffixesToAdd++;
-      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
-      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
-    }
-
-    // Set the suffix indices of each leaf.
-    assert(Root && "Root node can't be nullptr!");
-    setSuffixIndices();
-  }
-
-  /// Iterator for finding all repeated substrings in the suffix tree.
-  struct RepeatedSubstringIterator {
-  private:
-    /// The current node we're visiting.
-    SuffixTreeNode *N = nullptr;
-
-    /// The repeated substring associated with this node.
-    RepeatedSubstring RS;
-
-    /// The nodes left to visit.
-    std::vector<SuffixTreeNode *> ToVisit;
-
-    /// The minimum length of a repeated substring to find.
-    /// Since we're outlining, we want at least two instructions in the range.
-    /// FIXME: This may not be true for targets like X86 which support many
-    /// instruction lengths.
-    const unsigned MinLength = 2;
-
-    /// Move the iterator to the next repeated substring.
-    void advance() {
-      // Clear the current state. If we're at the end of the range, then this
-      // is the state we want to be in.
-      RS = RepeatedSubstring();
-      N = nullptr;
-
-      // Each leaf node represents a repeat of a string.
-      std::vector<SuffixTreeNode *> LeafChildren;
-
-      // Continue visiting nodes until we find one which repeats more than once.
-      while (!ToVisit.empty()) {
-        SuffixTreeNode *Curr = ToVisit.back();
-        ToVisit.pop_back();
-        LeafChildren.clear();
-
-        // Keep track of the length of the string associated with the node. If
-        // it's too short, we'll quit.
-        unsigned Length = Curr->ConcatLen;
-
-        // Iterate over each child, saving internal nodes for visiting, and
-        // leaf nodes in LeafChildren. Internal nodes represent individual
-        // strings, which may repeat.
-        for (auto &ChildPair : Curr->Children) {
-          // Save all of this node's children for processing.
-          if (!ChildPair.second->isLeaf())
-            ToVisit.push_back(ChildPair.second);
-
-          // It's not an internal node, so it must be a leaf. If we have a
-          // long enough string, then save the leaf children.
-          else if (Length >= MinLength)
-            LeafChildren.push_back(ChildPair.second);
-        }
-
-        // The root never represents a repeated substring. If we're looking at
-        // that, then skip it.
-        if (Curr->isRoot())
-          continue;
-
-        // Do we have any repeated substrings?
-        if (LeafChildren.size() >= 2) {
-          // Yes. Update the state to reflect this, and then bail out.
-          N = Curr;
-          RS.Length = Length;
-          for (SuffixTreeNode *Leaf : LeafChildren)
-            RS.StartIndices.push_back(Leaf->SuffixIdx);
-          break;
-        }
-      }
-
-      // At this point, either NewRS is an empty RepeatedSubstring, or it was
-      // set in the above loop. Similarly, N is either nullptr, or the node
-      // associated with NewRS.
-    }
-
-  public:
-    /// Return the current repeated substring.
-    RepeatedSubstring &operator*() { return RS; }
-
-    RepeatedSubstringIterator &operator++() {
-      advance();
-      return *this;
-    }
-
-    RepeatedSubstringIterator operator++(int I) {
-      RepeatedSubstringIterator It(*this);
-      advance();
-      return It;
-    }
-
-    bool operator==(const RepeatedSubstringIterator &Other) {
-      return N == Other.N;
-    }
-    bool operator!=(const RepeatedSubstringIterator &Other) {
-      return !(*this == Other);
-    }
-
-    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
-      // Do we have a non-null node?
-      if (N) {
-        // Yes. At the first step, we need to visit all of N's children.
-        // Note: This means that we visit N last.
-        ToVisit.push_back(N);
-        advance();
-      }
-    }
-  };
-
-  typedef RepeatedSubstringIterator iterator;
-  iterator begin() { return iterator(Root); }
-  iterator end() { return iterator(nullptr); }
-};
-
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
 struct InstructionMapper {
 

diff  --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 9e81e464c681..6a3448dc3f85 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -141,6 +141,7 @@ add_llvm_component_library(LLVMSupport
   StringMap.cpp
   StringSaver.cpp
   StringRef.cpp
+  SuffixTree.cpp
   SymbolRemappingReader.cpp
   SystemUtils.cpp
   TarWriter.cpp

diff  --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
new file mode 100644
index 000000000000..0d419f12cd1d
--- /dev/null
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -0,0 +1,210 @@
+//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Suffix Tree class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTree.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+using namespace llvm;
+
+SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+  Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+  Active.Node = Root;
+
+  // Keep track of the number of suffixes we have to add of the current
+  // prefix.
+  unsigned SuffixesToAdd = 0;
+
+  // Construct the suffix tree iteratively on each prefix of the string.
+  // PfxEndIdx is the end index of the current prefix.
+  // End is one past the last element in the string.
+  for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+    SuffixesToAdd++;
+    LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+    SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+  }
+
+  // Set the suffix indices of each leaf.
+  assert(Root && "Root node can't be nullptr!");
+  setSuffixIndices();
+}
+
+SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
+                                       unsigned StartIdx, unsigned Edge) {
+
+  assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+  SuffixTreeNode *N = new (NodeAllocator.Allocate())
+      SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
+  Parent.Children[Edge] = N;
+
+  return N;
+}
+
+SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
+                                               unsigned StartIdx,
+                                               unsigned EndIdx, unsigned Edge) {
+
+  assert(StartIdx <= EndIdx && "String can't start after it ends!");
+  assert(!(!Parent && StartIdx != EmptyIdx) &&
+         "Non-root internal nodes must have parents!");
+
+  unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
+  SuffixTreeNode *N =
+      new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
+  if (Parent)
+    Parent->Children[Edge] = N;
+
+  return N;
+}
+
+void SuffixTree::setSuffixIndices() {
+  // List of nodes we need to visit along with the current length of the
+  // string.
+  std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
+
+  // Current node being visited.
+  SuffixTreeNode *CurrNode = Root;
+
+  // Sum of the lengths of the nodes down the path to the current one.
+  unsigned CurrNodeLen = 0;
+  ToVisit.push_back({CurrNode, CurrNodeLen});
+  while (!ToVisit.empty()) {
+    std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
+    ToVisit.pop_back();
+    CurrNode->ConcatLen = CurrNodeLen;
+    for (auto &ChildPair : CurrNode->Children) {
+      assert(ChildPair.second && "Node had a null child!");
+      ToVisit.push_back(
+          {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
+    }
+
+    // No children, so we are at the end of the string.
+    if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
+      CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
+  }
+}
+
+unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
+  SuffixTreeNode *NeedsLink = nullptr;
+
+  while (SuffixesToAdd > 0) {
+
+    // Are we waiting to add anything other than just the last character?
+    if (Active.Len == 0) {
+      // If not, then say the active index is the end index.
+      Active.Idx = EndIdx;
+    }
+
+    assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+    // The first character in the current substring we're looking at.
+    unsigned FirstChar = Str[Active.Idx];
+
+    // Have we inserted anything starting with FirstChar at the current node?
+    if (Active.Node->Children.count(FirstChar) == 0) {
+      // If not, then we can just insert a leaf and move to the next step.
+      insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+      // The active node is an internal node, and we visited it, so it must
+      // need a link if it doesn't have one.
+      if (NeedsLink) {
+        NeedsLink->Link = Active.Node;
+        NeedsLink = nullptr;
+      }
+    } else {
+      // There's a match with FirstChar, so look for the point in the tree to
+      // insert a new node.
+      SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+      unsigned SubstringLen = NextNode->size();
+
+      // Is the current suffix we're trying to insert longer than the size of
+      // the child we want to move to?
+      if (Active.Len >= SubstringLen) {
+        // If yes, then consume the characters we've seen and move to the next
+        // node.
+        Active.Idx += SubstringLen;
+        Active.Len -= SubstringLen;
+        Active.Node = NextNode;
+        continue;
+      }
+
+      // Otherwise, the suffix we're trying to insert must be contained in the
+      // next node we want to move to.
+      unsigned LastChar = Str[EndIdx];
+
+      // Is the string we're trying to insert a substring of the next node?
+      if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+        // If yes, then we're done for this step. Remember our insertion point
+        // and move to the next end index. At this point, we have an implicit
+        // suffix tree.
+        if (NeedsLink && !Active.Node->isRoot()) {
+          NeedsLink->Link = Active.Node;
+          NeedsLink = nullptr;
+        }
+
+        Active.Len++;
+        break;
+      }
+
+      // The string we're trying to insert isn't a substring of the next node,
+      // but matches up to a point. Split the node.
+      //
+      // For example, say we ended our search at a node n and we're trying to
+      // insert ABD. Then we'll create a new node s for AB, reduce n to just
+      // representing C, and insert a new leaf node l to represent d. This
+      // allows us to ensure that if n was a leaf, it remains a leaf.
+      //
+      //   | ABC  ---split--->  | AB
+      //   n                    s
+      //                     C / \ D
+      //                      n   l
+
+      // The node s from the diagram
+      SuffixTreeNode *SplitNode =
+          insertInternalNode(Active.Node, NextNode->StartIdx,
+                             NextNode->StartIdx + Active.Len - 1, FirstChar);
+
+      // Insert the new node representing the new substring into the tree as
+      // a child of the split node. This is the node l from the diagram.
+      insertLeaf(*SplitNode, EndIdx, LastChar);
+
+      // Make the old node a child of the split node and update its start
+      // index. This is the node n from the diagram.
+      NextNode->StartIdx += Active.Len;
+      SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+      // SplitNode is an internal node, update the suffix link.
+      if (NeedsLink)
+        NeedsLink->Link = SplitNode;
+
+      NeedsLink = SplitNode;
+    }
+
+    // We've added something new to the tree, so there's one less suffix to
+    // add.
+    SuffixesToAdd--;
+
+    if (Active.Node->isRoot()) {
+      if (Active.Len > 0) {
+        Active.Len--;
+        Active.Idx = EndIdx - SuffixesToAdd + 1;
+      }
+    } else {
+      // Start the next phase at the next smallest suffix.
+      Active.Node = Active.Node->Link;
+    }
+  }
+
+  return SuffixesToAdd;
+}

diff  --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index ce1c66d81a32..19fabfe2c74b 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -66,6 +66,7 @@ add_llvm_unittest(SupportTests
   ScaledNumberTest.cpp
   SourceMgrTest.cpp
   SpecialCaseListTest.cpp
+  SuffixTreeTest.cpp
   SwapByteOrderTest.cpp
   SymbolRemappingReaderTest.cpp
   TarWriterTest.cpp

diff  --git a/llvm/unittests/Support/SuffixTreeTest.cpp b/llvm/unittests/Support/SuffixTreeTest.cpp
new file mode 100644
index 000000000000..f5d8112ccf5c
--- /dev/null
+++ b/llvm/unittests/Support/SuffixTreeTest.cpp
@@ -0,0 +1,143 @@
+//===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTree.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+// Each example vector has a unique element at the end to represent the end of
+// the string
+
+// Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
+// {1, 2} twice in the provided string.
+TEST(SuffixTreeTest, TestSingleRepetition) {
+  std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
+  SuffixTree ST(SimpleRepetitionData);
+  std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+  for (auto It = ST.begin(); It != ST.end(); It++)
+    SubStrings.push_back(*It);
+  ASSERT_EQ(SubStrings.size(), 1u);
+  EXPECT_EQ(SubStrings[0].Length, 2u);
+  EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
+  for (unsigned StartIdx : SubStrings[0].StartIndices) {
+    EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
+    EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
+  }
+}
+
+// Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
+// at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
+// This test also serves as a flag for improvements to the suffix tree.
+//
+// FIXME: Right now, the longest repeated substring from a specific index is
+// returned, this could be improved to return the longest repeated substring, as
+// well as those that are smaller.
+TEST(SuffixTreeTest, TestLongerRepetition) {
+  std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
+  SuffixTree ST(RepeatedRepetitionData);
+  std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+  for (auto It = ST.begin(); It != ST.end(); It++)
+    SubStrings.push_back(*It);
+  EXPECT_EQ(SubStrings.size(), 2u);
+  unsigned Len;
+  for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+    Len = RS.Length;
+    bool IsExpectedLen = (Len == 3u || Len == 2u);
+    bool IsExpectedIndex;
+    ASSERT_TRUE(IsExpectedLen);
+
+    if (Len == 3u) {
+      for (unsigned StartIdx : RS.StartIndices) {
+        IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
+        EXPECT_TRUE(IsExpectedIndex);
+        EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
+        EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
+        EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
+      }
+    } else {
+      for (unsigned StartIdx : RS.StartIndices) {
+        IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
+        EXPECT_TRUE(IsExpectedIndex);
+        EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
+        EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
+      }
+    }
+  }
+}
+
+// Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
+// indices 0 and 1.
+//
+// FIXME: Add support for detecting {1, 1} and {1, 1, 1}
+TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
+  std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
+  std::vector<unsigned>::iterator RRDIt, RRDIt2;
+  SuffixTree ST(RepeatedRepetitionData);
+  std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+  for (auto It = ST.begin(); It != ST.end(); It++)
+    SubStrings.push_back(*It);
+  EXPECT_EQ(SubStrings.size(), 1u);
+  for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+    EXPECT_EQ(RS.StartIndices.size(),
+              RepeatedRepetitionData.size() - RS.Length);
+    for (unsigned StartIdx : SubStrings[0].StartIndices) {
+      RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
+      std::advance(RRDIt, StartIdx);
+      std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
+      ASSERT_TRUE(
+          all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
+                 [](unsigned Elt) { return Elt == 1; }));
+    }
+  }
+}
+
+// The suffix tree cannot currently find repeated substrings in strings of the
+// form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
+// repeats")
+//
+// FIXME: Teach the SuffixTree to recognize these cases.
+TEST(SuffixTreeTest, TestTandemRepeat) {
+  std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
+  SuffixTree ST(RepeatedRepetitionData);
+  std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+  for (auto It = ST.begin(); It != ST.end(); It++)
+    SubStrings.push_back(*It);
+  EXPECT_EQ(SubStrings.size(), 0u);
+}
+
+// Tests that the SuffixTree does not erroneously include values that are not
+// in repeated substrings.  That is, only finds {1, 1} at indices 0 and 3 and
+// does not include 2 and 3.
+TEST(SuffixTreeTest, TestExclusion) {
+  std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
+  std::vector<unsigned>::iterator RRDIt, RRDIt2;
+  SuffixTree ST(RepeatedRepetitionData);
+  std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+  for (auto It = ST.begin(); It != ST.end(); It++)
+    SubStrings.push_back(*It);
+  EXPECT_EQ(SubStrings.size(), 1u);
+  bool IsExpectedIndex;
+  for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+    for (unsigned StartIdx : RS.StartIndices) {
+      IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
+      EXPECT_TRUE(IsExpectedIndex);
+      RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
+      std::advance(RRDIt, StartIdx);
+      std::advance(RRDIt2, StartIdx + RS.Length);
+      EXPECT_TRUE(
+          all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
+                 [](unsigned Elt) { return Elt == 1; }));
+    }
+  }
+}
+
+} // namespace


        


More information about the llvm-commits mailing list