LLVM 20.0.0git
SuffixTree.cpp
Go to the documentation of this file.
1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
17
18using namespace llvm;
19
20/// \returns the number of elements in the substring associated with \p N.
22 assert(N && "Got a null node?");
23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24 if (Internal->isRoot())
25 return 0;
26 return N->getEndIdx() - N->getStartIdx() + 1;
27}
28
32 Root = insertRoot();
33 Active.Node = Root;
34
35 // Keep track of the number of suffixes we have to add of the current
36 // prefix.
37 unsigned SuffixesToAdd = 0;
38
39 // Construct the suffix tree iteratively on each prefix of the string.
40 // PfxEndIdx is the end index of the current prefix.
41 // End is one past the last element in the string.
42 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
43 SuffixesToAdd++;
44 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
45 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
46 }
47
48 // Set the suffix indices of each leaf.
49 assert(Root && "Root node can't be nullptr!");
50 setSuffixIndices();
51
52 // Collect all leaf nodes of the suffix tree. And for each internal node,
53 // record the range of leaf nodes that are descendants of it.
55 setLeafNodes();
56}
57
58SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
59 unsigned StartIdx, unsigned Edge) {
60 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
61 auto *N = new (LeafNodeAllocator.Allocate())
62 SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
63 Parent.Children[Edge] = N;
64 return N;
65}
66
68SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
69 unsigned StartIdx, unsigned EndIdx,
70 unsigned Edge) {
71 assert(StartIdx <= EndIdx && "String can't start after it ends!");
72 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
73 "Non-root internal nodes must have parents!");
74 auto *N = new (InternalNodeAllocator.Allocate())
75 SuffixTreeInternalNode(StartIdx, EndIdx, Root);
76 if (Parent)
77 Parent->Children[Edge] = N;
78 return N;
79}
80
81SuffixTreeInternalNode *SuffixTree::insertRoot() {
82 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
83 SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
84}
85
86void SuffixTree::setSuffixIndices() {
87 // List of nodes we need to visit along with the current length of the
88 // string.
90
91 // Current node being visited.
92 SuffixTreeNode *CurrNode = Root;
93
94 // Sum of the lengths of the nodes down the path to the current one.
95 unsigned CurrNodeLen = 0;
96 ToVisit.push_back({CurrNode, CurrNodeLen});
97 while (!ToVisit.empty()) {
98 std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
99 ToVisit.pop_back();
100 // Length of the current node from the root down to here.
101 CurrNode->setConcatLen(CurrNodeLen);
102 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
103 for (auto &ChildPair : InternalNode->Children) {
104 assert(ChildPair.second && "Node had a null child!");
105 ToVisit.push_back(
106 {ChildPair.second,
107 CurrNodeLen + numElementsInSubstring(ChildPair.second)});
108 }
109 // No children, so we are at the end of the string.
110 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
111 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
112 }
113}
114
115void SuffixTree::setLeafNodes() {
116 // A stack that keeps track of nodes to visit for post-order DFS traversal.
118 ToVisit.push_back(Root);
119
120 // This keeps track of the index of the next leaf node to be added to
121 // the LeafNodes vector of the suffix tree.
122 unsigned LeafCounter = 0;
123
124 // This keeps track of nodes whose children have been added to the stack.
125 // The value is a pair, representing a node's first and last children.
127 std::pair<SuffixTreeNode *, SuffixTreeNode *>>
128 ChildrenMap;
129
130 // Traverse the tree in post-order.
131 while (!ToVisit.empty()) {
132 SuffixTreeNode *CurrNode = ToVisit.pop_back_val();
133 if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
134 // The current node is an internal node.
135 auto I = ChildrenMap.find(CurrInternalNode);
136 if (I == ChildrenMap.end()) {
137 // This is the first time we visit this node.
138 // Its children have not been added to the stack yet.
139 // We add current node back, and add its children to the stack.
140 // We keep track of the first and last children of the current node.
141 auto J = CurrInternalNode->Children.begin();
142 if (J != CurrInternalNode->Children.end()) {
143 ToVisit.push_back(CurrNode);
144 SuffixTreeNode *FirstChild = J->second;
145 SuffixTreeNode *LastChild = nullptr;
146 for (; J != CurrInternalNode->Children.end(); ++J) {
147 LastChild = J->second;
148 ToVisit.push_back(LastChild);
149 }
150 ChildrenMap[CurrInternalNode] = {FirstChild, LastChild};
151 }
152 } else {
153 // This is the second time we visit this node.
154 // All of its children have already been processed.
155 // Now, we can set its LeftLeafIdx and RightLeafIdx;
156 auto [FirstChild, LastChild] = I->second;
157 // Get the first child to use its RightLeafIdx.
158 // The first child is the first one added to the stack, so it is
159 // the last one to be processed. Hence, the leaf descendants
160 // of the first child are assigned the largest index numbers.
161 CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx());
162 // Get the last child to use its LeftLeafIdx.
163 CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx());
164 assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() &&
165 "LeftLeafIdx should not be larger than RightLeafIdx");
166 }
167 } else {
168 // The current node is a leaf node.
169 // We can simply set its LeftLeafIdx and RightLeafIdx.
170 CurrNode->setLeftLeafIdx(LeafCounter);
171 CurrNode->setRightLeafIdx(LeafCounter);
172 ++LeafCounter;
173 auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode);
174 LeafNodes.push_back(CurrLeafNode);
175 }
176 }
177}
178
179unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
180 SuffixTreeInternalNode *NeedsLink = nullptr;
181
182 while (SuffixesToAdd > 0) {
183
184 // Are we waiting to add anything other than just the last character?
185 if (Active.Len == 0) {
186 // If not, then say the active index is the end index.
187 Active.Idx = EndIdx;
188 }
189
190 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
191
192 // The first character in the current substring we're looking at.
193 unsigned FirstChar = Str[Active.Idx];
194
195 // Have we inserted anything starting with FirstChar at the current node?
196 if (Active.Node->Children.count(FirstChar) == 0) {
197 // If not, then we can just insert a leaf and move to the next step.
198 insertLeaf(*Active.Node, EndIdx, FirstChar);
199
200 // The active node is an internal node, and we visited it, so it must
201 // need a link if it doesn't have one.
202 if (NeedsLink) {
203 NeedsLink->setLink(Active.Node);
204 NeedsLink = nullptr;
205 }
206 } else {
207 // There's a match with FirstChar, so look for the point in the tree to
208 // insert a new node.
209 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
210
211 unsigned SubstringLen = numElementsInSubstring(NextNode);
212
213 // Is the current suffix we're trying to insert longer than the size of
214 // the child we want to move to?
215 if (Active.Len >= SubstringLen) {
216 // If yes, then consume the characters we've seen and move to the next
217 // node.
218 assert(isa<SuffixTreeInternalNode>(NextNode) &&
219 "Expected an internal node?");
220 Active.Idx += SubstringLen;
221 Active.Len -= SubstringLen;
222 Active.Node = cast<SuffixTreeInternalNode>(NextNode);
223 continue;
224 }
225
226 // Otherwise, the suffix we're trying to insert must be contained in the
227 // next node we want to move to.
228 unsigned LastChar = Str[EndIdx];
229
230 // Is the string we're trying to insert a substring of the next node?
231 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
232 // If yes, then we're done for this step. Remember our insertion point
233 // and move to the next end index. At this point, we have an implicit
234 // suffix tree.
235 if (NeedsLink && !Active.Node->isRoot()) {
236 NeedsLink->setLink(Active.Node);
237 NeedsLink = nullptr;
238 }
239
240 Active.Len++;
241 break;
242 }
243
244 // The string we're trying to insert isn't a substring of the next node,
245 // but matches up to a point. Split the node.
246 //
247 // For example, say we ended our search at a node n and we're trying to
248 // insert ABD. Then we'll create a new node s for AB, reduce n to just
249 // representing C, and insert a new leaf node l to represent d. This
250 // allows us to ensure that if n was a leaf, it remains a leaf.
251 //
252 // | ABC ---split---> | AB
253 // n s
254 // C / \ D
255 // n l
256
257 // The node s from the diagram
258 SuffixTreeInternalNode *SplitNode = insertInternalNode(
259 Active.Node, NextNode->getStartIdx(),
260 NextNode->getStartIdx() + Active.Len - 1, FirstChar);
261
262 // Insert the new node representing the new substring into the tree as
263 // a child of the split node. This is the node l from the diagram.
264 insertLeaf(*SplitNode, EndIdx, LastChar);
265
266 // Make the old node a child of the split node and update its start
267 // index. This is the node n from the diagram.
268 NextNode->incrementStartIdx(Active.Len);
269 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
270
271 // SplitNode is an internal node, update the suffix link.
272 if (NeedsLink)
273 NeedsLink->setLink(SplitNode);
274
275 NeedsLink = SplitNode;
276 }
277
278 // We've added something new to the tree, so there's one less suffix to
279 // add.
280 SuffixesToAdd--;
281
282 if (Active.Node->isRoot()) {
283 if (Active.Len > 0) {
284 Active.Len--;
285 Active.Idx = EndIdx - SuffixesToAdd + 1;
286 }
287 } else {
288 // Start the next phase at the next smallest suffix.
289 Active.Node = Active.Node->getLink();
290 }
291 }
292
293 return SuffixesToAdd;
294}
295
296void SuffixTree::RepeatedSubstringIterator::advance() {
297 // Clear the current state. If we're at the end of the range, then this
298 // is the state we want to be in.
299 RS = RepeatedSubstring();
300 N = nullptr;
301
302 // Each leaf node represents a repeat of a string.
303 SmallVector<unsigned> RepeatedSubstringStarts;
304
305 // Continue visiting nodes until we find one which repeats more than once.
306 while (!InternalNodesToVisit.empty()) {
307 RepeatedSubstringStarts.clear();
308 auto *Curr = InternalNodesToVisit.back();
309 InternalNodesToVisit.pop_back();
310
311 // Keep track of the length of the string associated with the node. If
312 // it's too short, we'll quit.
313 unsigned Length = Curr->getConcatLen();
314
315 // Iterate over each child, saving internal nodes for visiting.
316 // Internal nodes represent individual strings, which may repeat.
317 for (auto &ChildPair : Curr->Children)
318 // Save all of this node's children for processing.
319 if (auto *InternalChild =
320 dyn_cast<SuffixTreeInternalNode>(ChildPair.second))
321 InternalNodesToVisit.push_back(InternalChild);
322
323 // If length of repeated substring is below threshold, then skip it.
324 if (Length < MinLength)
325 continue;
326
327 // The root never represents a repeated substring. If we're looking at
328 // that, then skip it.
329 if (Curr->isRoot())
330 continue;
331
332 // Collect leaf children or leaf descendants by OutlinerLeafDescendants.
333 if (OutlinerLeafDescendants) {
334 for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx();
335 ++I)
336 RepeatedSubstringStarts.push_back(LeafNodes[I]->getSuffixIdx());
337 } else {
338 for (auto &ChildPair : Curr->Children)
339 if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second))
340 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
341 }
342
343 // Do we have any repeated substrings?
344 if (RepeatedSubstringStarts.size() < 2)
345 continue;
346
347 // Yes. Update the state to reflect this, and then bail out.
348 N = Curr;
349 RS.Length = Length;
350 for (unsigned StartIdx : RepeatedSubstringStarts)
351 RS.StartIndices.push_back(StartIdx);
352 break;
353 }
354 // At this point, either NewRS is an empty RepeatedSubstring, or it was
355 // set in the above loop. Similarly, N is either nullptr, or the node
356 // associated with NewRS.
357}
This file defines the BumpPtrAllocator interface.
bool End
Definition: ELF_riscv.cpp:480
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< bool > OutlinerLeafDescendants("outliner-leaf-descendants", cl::init(true), cl::Hidden, cl::desc("Consider all leaf descendants of internal nodes of the suffix " "tree as candidates for outlining (if false, only leaf children " "are considered)"))
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static size_t numElementsInSubstring(const SuffixTreeNode *N)
Definition: SuffixTree.cpp:21
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
SuffixTree(const ArrayRef< unsigned > &Str, bool OutlinerLeafDescendants=false)
Construct a suffix tree from a sequence of unsigned integers.
Definition: SuffixTree.cpp:29
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:43
bool OutlinerLeafDescendants
Whether to consider leaf descendants or only leaf children.
Definition: SuffixTree.h:46
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:480
#define N
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
void setLink(SuffixTreeInternalNode *L)
Sets Link to L. Assumes L is not null.
A node in a suffix tree which represents a substring or suffix.
void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
unsigned getLeftLeafIdx() const
unsigned getRightLeafIdx() const
void setRightLeafIdx(unsigned Idx)
Set the index of the right most leaf node of this node to Idx.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
unsigned getStartIdx() const
void setLeftLeafIdx(unsigned Idx)
Set the index of the left most leaf node of this node to Idx.
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:54
unsigned Length
The length of the string.
Definition: SuffixTree.h:51