LLVM 19.0.0git
SuffixTree.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// A data structure for fast substring queries.
9//
10// Suffix trees represent the suffixes of their input strings in their leaves.
11// A suffix tree is a type of compressed trie structure where each node
12// represents an entire substring rather than a single character. Each leaf
13// of the tree is a suffix.
14//
15// A suffix tree can be seen as a type of state machine where each state is a
16// substring of the full string. The tree is structured so that, for a string
17// of length N, there are exactly N leaves in the tree. This structure allows
18// us to quickly find repeated substrings of the input string.
19//
20// In this implementation, a "string" is a vector of unsigned integers.
21// These integers may result from hashing some data type. A suffix tree can
22// contain 1 or many strings, which can then be queried as one large string.
23//
24// The suffix tree is implemented using Ukkonen's algorithm for linear-time
25// suffix tree construction. Ukkonen's algorithm is explained in more detail
26// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27// paper is available at
28//
29// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30//===----------------------------------------------------------------------===//
31
32#ifndef LLVM_SUPPORT_SUFFIXTREE_H
33#define LLVM_SUPPORT_SUFFIXTREE_H
34
35#include "llvm/ADT/ArrayRef.h"
38
39namespace llvm {
41public:
42 /// Each element is an integer representing an instruction in the module.
44
45 /// A repeated substring in the tree.
47 /// The length of the string.
48 unsigned Length;
49
50 /// The start indices of each occurrence.
52 };
53
54private:
55 /// Maintains internal nodes in the tree.
57 /// Maintains leaf nodes in the tree.
59
60 /// The root of the suffix tree.
61 ///
62 /// The root represents the empty string. It is maintained by the
63 /// \p NodeAllocator like every other node in the tree.
64 SuffixTreeInternalNode *Root = nullptr;
65
66 /// The end index of each leaf in the tree.
67 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
68
69 /// Helper struct which keeps track of the next insertion point in
70 /// Ukkonen's algorithm.
71 struct ActiveState {
72 /// The next node to insert at.
74
75 /// The index of the first character in the substring currently being added.
77
78 /// The length of the substring we have to add at the current step.
79 unsigned Len = 0;
80 };
81
82 /// The point the next insertion will take place at in the
83 /// construction algorithm.
84 ActiveState Active;
85
86 /// Allocate a leaf node and add it to the tree.
87 ///
88 /// \param Parent The parent of this node.
89 /// \param StartIdx The start index of this node's associated string.
90 /// \param Edge The label on the edge leaving \p Parent to this node.
91 ///
92 /// \returns A pointer to the allocated leaf node.
93 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
94 unsigned Edge);
95
96 /// Allocate an internal node and add it to the tree.
97 ///
98 /// \param Parent The parent of this node. Only null when allocating the root.
99 /// \param StartIdx The start index of this node's associated string.
100 /// \param EndIdx The end index of this node's associated string.
101 /// \param Edge The label on the edge leaving \p Parent to this node.
102 ///
103 /// \returns A pointer to the allocated internal node.
104 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
105 unsigned StartIdx, unsigned EndIdx,
106 unsigned Edge);
107
108 /// Allocate the root node and add it to the tree.
109 ///
110 /// \returns A pointer to the root.
111 SuffixTreeInternalNode *insertRoot();
112
113 /// Set the suffix indices of the leaves to the start indices of their
114 /// respective suffixes.
115 void setSuffixIndices();
116
117 /// Construct the suffix tree for the prefix of the input ending at
118 /// \p EndIdx.
119 ///
120 /// Used to construct the full suffix tree iteratively. At the end of each
121 /// step, the constructed suffix tree is either a valid suffix tree, or a
122 /// suffix tree with implicit suffixes. At the end of the final step, the
123 /// suffix tree is a valid tree.
124 ///
125 /// \param EndIdx The end index of the current prefix in the main string.
126 /// \param SuffixesToAdd The number of suffixes that must be added
127 /// to complete the suffix tree at the current phase.
128 ///
129 /// \returns The number of suffixes that have not been added at the end of
130 /// this step.
131 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
132
133public:
134 /// Construct a suffix tree from a sequence of unsigned integers.
135 ///
136 /// \param Str The string to construct the suffix tree for.
138
139 /// Iterator for finding all repeated substrings in the suffix tree.
141 private:
142 /// The current node we're visiting.
143 SuffixTreeNode *N = nullptr;
144
145 /// The repeated substring associated with this node.
147
148 /// The nodes left to visit.
149 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
150
151 /// The minimum length of a repeated substring to find.
152 /// Since we're outlining, we want at least two instructions in the range.
153 /// FIXME: This may not be true for targets like X86 which support many
154 /// instruction lengths.
155 const unsigned MinLength = 2;
156
157 /// Move the iterator to the next repeated substring.
158 void advance();
159
160 public:
161 /// Return the current repeated substring.
162 RepeatedSubstring &operator*() { return RS; }
163
165 advance();
166 return *this;
167 }
168
171 advance();
172 return It;
173 }
174
176 return N == Other.N;
177 }
179 return !(*this == Other);
180 }
181
183 // Do we have a non-null node?
184 if (!N)
185 return;
186 // Yes. At the first step, we need to visit all of N's children.
187 // Note: This means that we visit N last.
188 InternalNodesToVisit.push_back(N);
189 advance();
190 }
191 };
192
194 iterator begin() { return iterator(Root); }
195 iterator end() { return iterator(nullptr); }
196};
197
198} // namespace llvm
199
200#endif // LLVM_SUPPORT_SUFFIXTREE_H
This file defines the BumpPtrAllocator interface.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define I(x, y, z)
Definition: MD5.cpp:58
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
iterator begin()
Definition: SuffixTree.h:194
iterator end()
Definition: SuffixTree.h:195
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:43
RepeatedSubstringIterator iterator
Definition: SuffixTree.h:193
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Other
Any other memory.
#define N
A node in a suffix tree which represents a substring or suffix.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Iterator for finding all repeated substrings in the suffix tree.
Definition: SuffixTree.h:140
RepeatedSubstringIterator(SuffixTreeInternalNode *N)
Definition: SuffixTree.h:182
RepeatedSubstringIterator operator++(int I)
Definition: SuffixTree.h:169
bool operator!=(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:178
RepeatedSubstring & operator*()
Return the current repeated substring.
Definition: SuffixTree.h:162
bool operator==(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:175
RepeatedSubstringIterator & operator++()
Definition: SuffixTree.h:164
A repeated substring in the tree.
Definition: SuffixTree.h:46
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:51
unsigned Length
The length of the string.
Definition: SuffixTree.h:48