LLVM 17.0.0git
SuffixTree.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Suffix Tree class and Suffix Tree Node struct.
10//
11//===----------------------------------------------------------------------===//
12#ifndef LLVM_SUPPORT_SUFFIXTREE_H
13#define LLVM_SUPPORT_SUFFIXTREE_H
14
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/DenseMap.h"
18#include <vector>
19
20namespace llvm {
21
22/// Represents an undefined index in the suffix tree.
23const unsigned EmptyIdx = -1;
24
25/// A node in a suffix tree which represents a substring or suffix.
26///
27/// Each node has either no children or at least two children, with the root
28/// being a exception in the empty tree.
29///
30/// Children are represented as a map between unsigned integers and nodes. If
31/// a node N has a child M on unsigned integer k, then the mapping represented
32/// by N is a proper prefix of the mapping represented by M. Note that this,
33/// although similar to a trie is somewhat different: each node stores a full
34/// substring of the full mapping rather than a single character state.
35///
36/// Each internal node contains a pointer to the internal node representing
37/// the same string, but with the first character chopped off. This is stored
38/// in \p Link. Each leaf node stores the start index of its respective
39/// suffix in \p SuffixIdx.
41
42 /// The children of this node.
43 ///
44 /// A child existing on an unsigned integer implies that from the mapping
45 /// represented by the current node, there is a way to reach another
46 /// mapping by tacking that character on the end of the current string.
48
49 /// The start index of this node's substring in the main string.
50 unsigned StartIdx = EmptyIdx;
51
52 /// The end index of this node's substring in the main string.
53 ///
54 /// Every leaf node must have its \p EndIdx incremented at the end of every
55 /// step in the construction algorithm. To avoid having to update O(N)
56 /// nodes individually at the end of every step, the end index is stored
57 /// as a pointer.
58 unsigned *EndIdx = nullptr;
59
60 /// For leaves, the start index of the suffix represented by this node.
61 ///
62 /// For all other nodes, this is ignored.
63 unsigned SuffixIdx = EmptyIdx;
64
65 /// For internal nodes, a pointer to the internal node representing
66 /// the same sequence with the first character chopped off.
67 ///
68 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
69 /// Ukkonen's algorithm does to achieve linear-time construction is
70 /// keep track of which node the next insert should be at. This makes each
71 /// insert O(1), and there are a total of O(N) inserts. The suffix link
72 /// helps with inserting children of internal nodes.
73 ///
74 /// Say we add a child to an internal node with associated mapping S. The
75 /// next insertion must be at the node representing S - its first character.
76 /// This is given by the way that we iteratively build the tree in Ukkonen's
77 /// algorithm. The main idea is to look at the suffixes of each prefix in the
78 /// string, starting with the longest suffix of the prefix, and ending with
79 /// the shortest. Therefore, if we keep pointers between such nodes, we can
80 /// move to the next insertion point in O(1) time. If we don't, then we'd
81 /// have to query from the root, which takes O(N) time. This would make the
82 /// construction algorithm O(N^2) rather than O(N).
83 SuffixTreeNode *Link = nullptr;
84
85 /// The length of the string formed by concatenating the edge labels from the
86 /// root to this node.
87 unsigned ConcatLen = 0;
88
89 /// Returns true if this node is a leaf.
90 bool isLeaf() const { return SuffixIdx != EmptyIdx; }
91
92 /// Returns true if this node is the root of its owning \p SuffixTree.
93 bool isRoot() const { return StartIdx == EmptyIdx; }
94
95 /// Return the number of elements in the substring associated with this node.
96 size_t size() const {
97
98 // Is it the root? If so, it's the empty string so return 0.
99 if (isRoot())
100 return 0;
101
102 assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
103
104 // Size = the number of elements in the string.
105 // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
106 return *EndIdx - StartIdx + 1;
107 }
108
111
112 SuffixTreeNode() = default;
113};
114
115/// A data structure for fast substring queries.
116///
117/// Suffix trees represent the suffixes of their input strings in their leaves.
118/// A suffix tree is a type of compressed trie structure where each node
119/// represents an entire substring rather than a single character. Each leaf
120/// of the tree is a suffix.
121///
122/// A suffix tree can be seen as a type of state machine where each state is a
123/// substring of the full string. The tree is structured so that, for a string
124/// of length N, there are exactly N leaves in the tree. This structure allows
125/// us to quickly find repeated substrings of the input string.
126///
127/// In this implementation, a "string" is a vector of unsigned integers.
128/// These integers may result from hashing some data type. A suffix tree can
129/// contain 1 or many strings, which can then be queried as one large string.
130///
131/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
132/// suffix tree construction. Ukkonen's algorithm is explained in more detail
133/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
134/// paper is available at
135///
136/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
138public:
139 /// Each element is an integer representing an instruction in the module.
141
142 /// A repeated substring in the tree.
144 /// The length of the string.
145 unsigned Length;
146
147 /// The start indices of each occurrence.
149 };
150
151private:
152 /// Maintains each node in the tree.
154
155 /// The root of the suffix tree.
156 ///
157 /// The root represents the empty string. It is maintained by the
158 /// \p NodeAllocator like every other node in the tree.
159 SuffixTreeNode *Root = nullptr;
160
161 /// Maintains the end indices of the internal nodes in the tree.
162 ///
163 /// Each internal node is guaranteed to never have its end index change
164 /// during the construction algorithm; however, leaves must be updated at
165 /// every step. Therefore, we need to store leaf end indices by reference
166 /// to avoid updating O(N) leaves at every step of construction. Thus,
167 /// every internal node must be allocated its own end index.
168 BumpPtrAllocator InternalEndIdxAllocator;
169
170 /// The end index of each leaf in the tree.
171 unsigned LeafEndIdx = -1;
172
173 /// Helper struct which keeps track of the next insertion point in
174 /// Ukkonen's algorithm.
175 struct ActiveState {
176 /// The next node to insert at.
177 SuffixTreeNode *Node = nullptr;
178
179 /// The index of the first character in the substring currently being added.
180 unsigned Idx = EmptyIdx;
181
182 /// The length of the substring we have to add at the current step.
183 unsigned Len = 0;
184 };
185
186 /// The point the next insertion will take place at in the
187 /// construction algorithm.
188 ActiveState Active;
189
190 /// Allocate a leaf node and add it to the tree.
191 ///
192 /// \param Parent The parent of this node.
193 /// \param StartIdx The start index of this node's associated string.
194 /// \param Edge The label on the edge leaving \p Parent to this node.
195 ///
196 /// \returns A pointer to the allocated leaf node.
197 SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
198 unsigned Edge);
199
200 /// Allocate an internal node and add it to the tree.
201 ///
202 /// \param Parent The parent of this node. Only null when allocating the root.
203 /// \param StartIdx The start index of this node's associated string.
204 /// \param EndIdx The end index of this node's associated string.
205 /// \param Edge The label on the edge leaving \p Parent to this node.
206 ///
207 /// \returns A pointer to the allocated internal node.
208 SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
209 unsigned EndIdx, unsigned Edge);
210
211 /// Set the suffix indices of the leaves to the start indices of their
212 /// respective suffixes.
213 void setSuffixIndices();
214
215 /// Construct the suffix tree for the prefix of the input ending at
216 /// \p EndIdx.
217 ///
218 /// Used to construct the full suffix tree iteratively. At the end of each
219 /// step, the constructed suffix tree is either a valid suffix tree, or a
220 /// suffix tree with implicit suffixes. At the end of the final step, the
221 /// suffix tree is a valid tree.
222 ///
223 /// \param EndIdx The end index of the current prefix in the main string.
224 /// \param SuffixesToAdd The number of suffixes that must be added
225 /// to complete the suffix tree at the current phase.
226 ///
227 /// \returns The number of suffixes that have not been added at the end of
228 /// this step.
229 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
230
231public:
232 /// Construct a suffix tree from a sequence of unsigned integers.
233 ///
234 /// \param Str The string to construct the suffix tree for.
236
237 /// Iterator for finding all repeated substrings in the suffix tree.
239 private:
240 /// The current node we're visiting.
241 SuffixTreeNode *N = nullptr;
242
243 /// The repeated substring associated with this node.
245
246 /// The nodes left to visit.
248
249 /// The minimum length of a repeated substring to find.
250 /// Since we're outlining, we want at least two instructions in the range.
251 /// FIXME: This may not be true for targets like X86 which support many
252 /// instruction lengths.
253 const unsigned MinLength = 2;
254
255 /// Move the iterator to the next repeated substring.
256 void advance() {
257 // Clear the current state. If we're at the end of the range, then this
258 // is the state we want to be in.
259 RS = RepeatedSubstring();
260 N = nullptr;
261
262 // Each leaf node represents a repeat of a string.
264
265 // Continue visiting nodes until we find one which repeats more than once.
266 while (!ToVisit.empty()) {
267 SuffixTreeNode *Curr = ToVisit.back();
268 ToVisit.pop_back();
269 LeafChildren.clear();
270
271 // Keep track of the length of the string associated with the node. If
272 // it's too short, we'll quit.
273 unsigned Length = Curr->ConcatLen;
274
275 // Iterate over each child, saving internal nodes for visiting, and
276 // leaf nodes in LeafChildren. Internal nodes represent individual
277 // strings, which may repeat.
278 for (auto &ChildPair : Curr->Children) {
279 // Save all of this node's children for processing.
280 if (!ChildPair.second->isLeaf())
281 ToVisit.push_back(ChildPair.second);
282
283 // It's not an internal node, so it must be a leaf. If we have a
284 // long enough string, then save the leaf children.
285 else if (Length >= MinLength)
286 LeafChildren.push_back(ChildPair.second);
287 }
288
289 // The root never represents a repeated substring. If we're looking at
290 // that, then skip it.
291 if (Curr->isRoot())
292 continue;
293
294 // Do we have any repeated substrings?
295 if (LeafChildren.size() >= 2) {
296 // Yes. Update the state to reflect this, and then bail out.
297 N = Curr;
298 RS.Length = Length;
299 for (SuffixTreeNode *Leaf : LeafChildren)
300 RS.StartIndices.push_back(Leaf->SuffixIdx);
301 break;
302 }
303 }
304
305 // At this point, either NewRS is an empty RepeatedSubstring, or it was
306 // set in the above loop. Similarly, N is either nullptr, or the node
307 // associated with NewRS.
308 }
309
310 public:
311 /// Return the current repeated substring.
312 RepeatedSubstring &operator*() { return RS; }
313
315 advance();
316 return *this;
317 }
318
321 advance();
322 return It;
323 }
324
326 return N == Other.N;
327 }
329 return !(*this == Other);
330 }
331
333 // Do we have a non-null node?
334 if (N) {
335 // Yes. At the first step, we need to visit all of N's children.
336 // Note: This means that we visit N last.
337 ToVisit.push_back(N);
338 advance();
339 }
340 }
341 };
342
344 iterator begin() { return iterator(Root); }
345 iterator end() { return iterator(nullptr); }
346};
347
348} // namespace llvm
349
350#endif // LLVM_SUPPORT_SUFFIXTREE_H
This file defines the BumpPtrAllocator interface.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
#define I(x, y, z)
Definition: MD5.cpp:58
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
A data structure for fast substring queries.
Definition: SuffixTree.h:137
iterator begin()
Definition: SuffixTree.h:344
iterator end()
Definition: SuffixTree.h:345
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:140
RepeatedSubstringIterator iterator
Definition: SuffixTree.h:343
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:406
const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Definition: SuffixTree.h:23
#define N
A node in a suffix tree which represents a substring or suffix.
Definition: SuffixTree.h:40
unsigned * EndIdx
The end index of this node's substring in the main string.
Definition: SuffixTree.h:58
bool isLeaf() const
Returns true if this node is a leaf.
Definition: SuffixTree.h:90
unsigned SuffixIdx
For leaves, the start index of the suffix represented by this node.
Definition: SuffixTree.h:63
SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
Definition: SuffixTree.h:109
size_t size() const
Return the number of elements in the substring associated with this node.
Definition: SuffixTree.h:96
SuffixTreeNode()=default
unsigned ConcatLen
The length of the string formed by concatenating the edge labels from the root to this node.
Definition: SuffixTree.h:87
SuffixTreeNode * Link
For internal nodes, a pointer to the internal node representing the same sequence with the first char...
Definition: SuffixTree.h:83
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
Definition: SuffixTree.h:47
unsigned StartIdx
The start index of this node's substring in the main string.
Definition: SuffixTree.h:50
bool isRoot() const
Returns true if this node is the root of its owning SuffixTree.
Definition: SuffixTree.h:93
Iterator for finding all repeated substrings in the suffix tree.
Definition: SuffixTree.h:238
RepeatedSubstringIterator operator++(int I)
Definition: SuffixTree.h:319
bool operator!=(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:328
RepeatedSubstring & operator*()
Return the current repeated substring.
Definition: SuffixTree.h:312
bool operator==(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:325
RepeatedSubstringIterator & operator++()
Definition: SuffixTree.h:314
A repeated substring in the tree.
Definition: SuffixTree.h:143
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:148
unsigned Length
The length of the string.
Definition: SuffixTree.h:145