LLVM 17.0.0git
SuffixTree.cpp
Go to the documentation of this file.
1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
15#include <vector>
16
17using namespace llvm;
18
20 Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21 Active.Node = Root;
22
23 // Keep track of the number of suffixes we have to add of the current
24 // prefix.
25 unsigned SuffixesToAdd = 0;
26
27 // Construct the suffix tree iteratively on each prefix of the string.
28 // PfxEndIdx is the end index of the current prefix.
29 // End is one past the last element in the string.
30 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31 SuffixesToAdd++;
32 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34 }
35
36 // Set the suffix indices of each leaf.
37 assert(Root && "Root node can't be nullptr!");
38 setSuffixIndices();
39}
40
41SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42 unsigned StartIdx, unsigned Edge) {
43
44 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45
46 SuffixTreeNode *N = new (NodeAllocator.Allocate())
47 SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48 Parent.Children[Edge] = N;
49
50 return N;
51}
52
53SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54 unsigned StartIdx,
55 unsigned EndIdx, unsigned Edge) {
56
57 assert(StartIdx <= EndIdx && "String can't start after it ends!");
58 assert(!(!Parent && StartIdx != EmptyIdx) &&
59 "Non-root internal nodes must have parents!");
60
61 unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
63 new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64 if (Parent)
65 Parent->Children[Edge] = N;
66
67 return N;
68}
69
70void SuffixTree::setSuffixIndices() {
71 // List of nodes we need to visit along with the current length of the
72 // string.
74
75 // Current node being visited.
76 SuffixTreeNode *CurrNode = Root;
77
78 // Sum of the lengths of the nodes down the path to the current one.
79 unsigned CurrNodeLen = 0;
80 ToVisit.push_back({CurrNode, CurrNodeLen});
81 while (!ToVisit.empty()) {
82 std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83 ToVisit.pop_back();
84 CurrNode->ConcatLen = CurrNodeLen;
85 for (auto &ChildPair : CurrNode->Children) {
86 assert(ChildPair.second && "Node had a null child!");
87 ToVisit.push_back(
88 {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89 }
90
91 // No children, so we are at the end of the string.
92 if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93 CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94 }
95}
96
97unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98 SuffixTreeNode *NeedsLink = nullptr;
99
100 while (SuffixesToAdd > 0) {
101
102 // Are we waiting to add anything other than just the last character?
103 if (Active.Len == 0) {
104 // If not, then say the active index is the end index.
105 Active.Idx = EndIdx;
106 }
107
108 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109
110 // The first character in the current substring we're looking at.
111 unsigned FirstChar = Str[Active.Idx];
112
113 // Have we inserted anything starting with FirstChar at the current node?
114 if (Active.Node->Children.count(FirstChar) == 0) {
115 // If not, then we can just insert a leaf and move to the next step.
116 insertLeaf(*Active.Node, EndIdx, FirstChar);
117
118 // The active node is an internal node, and we visited it, so it must
119 // need a link if it doesn't have one.
120 if (NeedsLink) {
121 NeedsLink->Link = Active.Node;
122 NeedsLink = nullptr;
123 }
124 } else {
125 // There's a match with FirstChar, so look for the point in the tree to
126 // insert a new node.
127 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128
129 unsigned SubstringLen = NextNode->size();
130
131 // Is the current suffix we're trying to insert longer than the size of
132 // the child we want to move to?
133 if (Active.Len >= SubstringLen) {
134 // If yes, then consume the characters we've seen and move to the next
135 // node.
136 Active.Idx += SubstringLen;
137 Active.Len -= SubstringLen;
138 Active.Node = NextNode;
139 continue;
140 }
141
142 // Otherwise, the suffix we're trying to insert must be contained in the
143 // next node we want to move to.
144 unsigned LastChar = Str[EndIdx];
145
146 // Is the string we're trying to insert a substring of the next node?
147 if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148 // If yes, then we're done for this step. Remember our insertion point
149 // and move to the next end index. At this point, we have an implicit
150 // suffix tree.
151 if (NeedsLink && !Active.Node->isRoot()) {
152 NeedsLink->Link = Active.Node;
153 NeedsLink = nullptr;
154 }
155
156 Active.Len++;
157 break;
158 }
159
160 // The string we're trying to insert isn't a substring of the next node,
161 // but matches up to a point. Split the node.
162 //
163 // For example, say we ended our search at a node n and we're trying to
164 // insert ABD. Then we'll create a new node s for AB, reduce n to just
165 // representing C, and insert a new leaf node l to represent d. This
166 // allows us to ensure that if n was a leaf, it remains a leaf.
167 //
168 // | ABC ---split---> | AB
169 // n s
170 // C / \ D
171 // n l
172
173 // The node s from the diagram
174 SuffixTreeNode *SplitNode =
175 insertInternalNode(Active.Node, NextNode->StartIdx,
176 NextNode->StartIdx + Active.Len - 1, FirstChar);
177
178 // Insert the new node representing the new substring into the tree as
179 // a child of the split node. This is the node l from the diagram.
180 insertLeaf(*SplitNode, EndIdx, LastChar);
181
182 // Make the old node a child of the split node and update its start
183 // index. This is the node n from the diagram.
184 NextNode->StartIdx += Active.Len;
185 SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186
187 // SplitNode is an internal node, update the suffix link.
188 if (NeedsLink)
189 NeedsLink->Link = SplitNode;
190
191 NeedsLink = SplitNode;
192 }
193
194 // We've added something new to the tree, so there's one less suffix to
195 // add.
196 SuffixesToAdd--;
197
198 if (Active.Node->isRoot()) {
199 if (Active.Len > 0) {
200 Active.Len--;
201 Active.Idx = EndIdx - SuffixesToAdd + 1;
202 }
203 } else {
204 // Start the next phase at the next smallest suffix.
205 Active.Node = Active.Node->Link;
206 }
207 }
208
209 return SuffixesToAdd;
210}
This file defines the BumpPtrAllocator interface.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
bool empty() const
Definition: SmallVector.h:94
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
SuffixTree(const ArrayRef< unsigned > &Str)
Construct a suffix tree from a sequence of unsigned integers.
Definition: SuffixTree.cpp:19
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:140
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Definition: SuffixTree.h:23
#define N
A node in a suffix tree which represents a substring or suffix.
Definition: SuffixTree.h:40
unsigned SuffixIdx
For leaves, the start index of the suffix represented by this node.
Definition: SuffixTree.h:63
size_t size() const
Return the number of elements in the substring associated with this node.
Definition: SuffixTree.h:96
unsigned ConcatLen
The length of the string formed by concatenating the edge labels from the root to this node.
Definition: SuffixTree.h:87
SuffixTreeNode * Link
For internal nodes, a pointer to the internal node representing the same sequence with the first char...
Definition: SuffixTree.h:83
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
Definition: SuffixTree.h:47
unsigned StartIdx
The start index of this node's substring in the main string.
Definition: SuffixTree.h:50
bool isRoot() const
Returns true if this node is the root of its owning SuffixTree.
Definition: SuffixTree.h:93