clang  3.9.0
CommentLexer.h
Go to the documentation of this file.
1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the lexer for structured comments and the supporting token class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15 #define LLVM_CLANG_AST_COMMENTLEXER_H
16 
17 #include "clang/Basic/Diagnostic.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/Support/Allocator.h"
23 #include "llvm/Support/raw_ostream.h"
24 
25 namespace clang {
26 namespace comments {
27 
28 class Lexer;
29 class TextTokenRetokenizer;
30 struct CommandInfo;
31 class CommandTraits;
32 
namespace tok {
/// Kinds of comment tokens stored in Token::Kind.
///
/// NOTE(review): the scraped listing this was recovered from had dropped ten
/// of the eighteen enumerator lines. text, verbatim_block_line,
/// verbatim_line_name and verbatim_line_text are required by the Token
/// accessors in this file; newline, verbatim_block_begin, verbatim_block_end,
/// html_equals, html_greater and html_slash_greater were restored to match
/// the upstream header -- confirm names and order against the release.
enum TokenKind {
  eof,
  newline,
  text,
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
55 
56 /// \brief Comment token.
57 class Token {
58  friend class Lexer;
59  friend class TextTokenRetokenizer;
60 
61  /// The location of the token.
62  SourceLocation Loc;
63 
64  /// The actual kind of the token.
66 
67  /// Length of the token spelling in comment. Can be 0 for synthenized
68  /// tokens.
69  unsigned Length;
70 
71  /// Contains text value associated with a token.
72  const char *TextPtr;
73 
74  /// Integer value associated with a token.
75  ///
76  /// If the token is a konwn command, contains command ID and TextPtr is
77  /// unused (command spelling can be found with CommandTraits). Otherwise,
78  /// contains the length of the string that starts at TextPtr.
79  unsigned IntVal;
80 
81 public:
82  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
83  void setLocation(SourceLocation SL) { Loc = SL; }
84 
85  SourceLocation getEndLocation() const LLVM_READONLY {
86  if (Length == 0 || Length == 1)
87  return Loc;
88  return Loc.getLocWithOffset(Length - 1);
89  }
90 
91  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
92  void setKind(tok::TokenKind K) { Kind = K; }
93 
94  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
95  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96 
97  unsigned getLength() const LLVM_READONLY { return Length; }
98  void setLength(unsigned L) { Length = L; }
99 
100  StringRef getText() const LLVM_READONLY {
101  assert(is(tok::text));
102  return StringRef(TextPtr, IntVal);
103  }
104 
105  void setText(StringRef Text) {
106  assert(is(tok::text));
107  TextPtr = Text.data();
108  IntVal = Text.size();
109  }
110 
111  StringRef getUnknownCommandName() const LLVM_READONLY {
112  assert(is(tok::unknown_command));
113  return StringRef(TextPtr, IntVal);
114  }
115 
116  void setUnknownCommandName(StringRef Name) {
117  assert(is(tok::unknown_command));
118  TextPtr = Name.data();
119  IntVal = Name.size();
120  }
121 
122  unsigned getCommandID() const LLVM_READONLY {
124  return IntVal;
125  }
126 
127  void setCommandID(unsigned ID) {
129  IntVal = ID;
130  }
131 
132  unsigned getVerbatimBlockID() const LLVM_READONLY {
134  return IntVal;
135  }
136 
137  void setVerbatimBlockID(unsigned ID) {
139  IntVal = ID;
140  }
141 
142  StringRef getVerbatimBlockText() const LLVM_READONLY {
143  assert(is(tok::verbatim_block_line));
144  return StringRef(TextPtr, IntVal);
145  }
146 
147  void setVerbatimBlockText(StringRef Text) {
148  assert(is(tok::verbatim_block_line));
149  TextPtr = Text.data();
150  IntVal = Text.size();
151  }
152 
153  unsigned getVerbatimLineID() const LLVM_READONLY {
154  assert(is(tok::verbatim_line_name));
155  return IntVal;
156  }
157 
158  void setVerbatimLineID(unsigned ID) {
159  assert(is(tok::verbatim_line_name));
160  IntVal = ID;
161  }
162 
163  StringRef getVerbatimLineText() const LLVM_READONLY {
164  assert(is(tok::verbatim_line_text));
165  return StringRef(TextPtr, IntVal);
166  }
167 
168  void setVerbatimLineText(StringRef Text) {
169  assert(is(tok::verbatim_line_text));
170  TextPtr = Text.data();
171  IntVal = Text.size();
172  }
173 
174  StringRef getHTMLTagStartName() const LLVM_READONLY {
175  assert(is(tok::html_start_tag));
176  return StringRef(TextPtr, IntVal);
177  }
178 
179  void setHTMLTagStartName(StringRef Name) {
180  assert(is(tok::html_start_tag));
181  TextPtr = Name.data();
182  IntVal = Name.size();
183  }
184 
185  StringRef getHTMLIdent() const LLVM_READONLY {
186  assert(is(tok::html_ident));
187  return StringRef(TextPtr, IntVal);
188  }
189 
190  void setHTMLIdent(StringRef Name) {
191  assert(is(tok::html_ident));
192  TextPtr = Name.data();
193  IntVal = Name.size();
194  }
195 
196  StringRef getHTMLQuotedString() const LLVM_READONLY {
197  assert(is(tok::html_quoted_string));
198  return StringRef(TextPtr, IntVal);
199  }
200 
201  void setHTMLQuotedString(StringRef Str) {
202  assert(is(tok::html_quoted_string));
203  TextPtr = Str.data();
204  IntVal = Str.size();
205  }
206 
207  StringRef getHTMLTagEndName() const LLVM_READONLY {
208  assert(is(tok::html_end_tag));
209  return StringRef(TextPtr, IntVal);
210  }
211 
212  void setHTMLTagEndName(StringRef Name) {
213  assert(is(tok::html_end_tag));
214  TextPtr = Name.data();
215  IntVal = Name.size();
216  }
217 
218  void dump(const Lexer &L, const SourceManager &SM) const;
219 };
220 
221 /// \brief Comment lexer.
222 class Lexer {
223 private:
224  Lexer(const Lexer &) = delete;
225  void operator=(const Lexer &) = delete;
226 
227  /// Allocator for strings that are semantic values of tokens and have to be
228  /// computed (for example, resolved decimal character references).
229  llvm::BumpPtrAllocator &Allocator;
230 
231  DiagnosticsEngine &Diags;
232 
233  const CommandTraits &Traits;
234 
235  const char *const BufferStart;
236  const char *const BufferEnd;
237  SourceLocation FileLoc;
238 
239  const char *BufferPtr;
240 
241  /// One past end pointer for the current comment. For BCPL comments points
242  /// to newline or BufferEnd, for C comments points to star in '*/'.
243  const char *CommentEnd;
244 
245  enum LexerCommentState {
246  LCS_BeforeComment,
247  LCS_InsideBCPLComment,
248  LCS_InsideCComment,
249  LCS_BetweenComments
250  };
251 
252  /// Low-level lexer state, track if we are inside or outside of comment.
253  LexerCommentState CommentState;
254 
255  enum LexerState {
256  /// Lexing normal comment text
257  LS_Normal,
258 
259  /// Finished lexing verbatim block beginning command, will lex first body
260  /// line.
261  LS_VerbatimBlockFirstLine,
262 
263  /// Lexing verbatim block body line-by-line, skipping line-starting
264  /// decorations.
265  LS_VerbatimBlockBody,
266 
267  /// Finished lexing verbatim line beginning command, will lex text (one
268  /// line).
269  LS_VerbatimLineText,
270 
271  /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
272  LS_HTMLStartTag,
273 
274  /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
275  LS_HTMLEndTag
276  };
277 
278  /// Current lexing mode.
279  LexerState State;
280 
281  /// If State is LS_VerbatimBlock, contains the name of verbatim end
282  /// command, including command marker.
283  SmallString<16> VerbatimBlockEndCommandName;
284 
285  /// Given a character reference name (e.g., "lt"), return the character that
286  /// it stands for (e.g., "<").
287  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
288 
289  /// Given a Unicode codepoint as base-10 integer, return the character.
290  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
291 
292  /// Given a Unicode codepoint as base-16 integer, return the character.
293  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
294 
295  void formTokenWithChars(Token &Result, const char *TokEnd,
297 
298  void formTextToken(Token &Result, const char *TokEnd) {
299  StringRef Text(BufferPtr, TokEnd - BufferPtr);
300  formTokenWithChars(Result, TokEnd, tok::text);
301  Result.setText(Text);
302  }
303 
304  SourceLocation getSourceLocation(const char *Loc) const {
305  assert(Loc >= BufferStart && Loc <= BufferEnd &&
306  "Location out of range for this buffer!");
307 
308  const unsigned CharNo = Loc - BufferStart;
309  return FileLoc.getLocWithOffset(CharNo);
310  }
311 
312  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
313  return Diags.Report(Loc, DiagID);
314  }
315 
316  /// Eat string matching regexp \code \s*\* \endcode.
317  void skipLineStartingDecorations();
318 
319  /// Lex stuff inside comments. CommentEnd should be set correctly.
320  void lexCommentText(Token &T);
321 
322  void setupAndLexVerbatimBlock(Token &T,
323  const char *TextBegin,
324  char Marker, const CommandInfo *Info);
325 
326  void lexVerbatimBlockFirstLine(Token &T);
327 
328  void lexVerbatimBlockBody(Token &T);
329 
330  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
331  const CommandInfo *Info);
332 
333  void lexVerbatimLineText(Token &T);
334 
335  void lexHTMLCharacterReference(Token &T);
336 
337  void setupAndLexHTMLStartTag(Token &T);
338 
339  void lexHTMLStartTag(Token &T);
340 
341  void setupAndLexHTMLEndTag(Token &T);
342 
343  void lexHTMLEndTag(Token &T);
344 
345 public:
346  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
347  const CommandTraits &Traits,
348  SourceLocation FileLoc,
349  const char *BufferStart, const char *BufferEnd);
350 
351  void lex(Token &T);
352 
353  StringRef getSpelling(const Token &Tok,
354  const SourceManager &SourceMgr,
355  bool *Invalid = nullptr) const;
356 };
357 
358 } // end namespace comments
359 } // end namespace clang
360 
361 #endif
362 
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:201
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:179
unsigned Length
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:85
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:97
Defines the SourceManager interface.
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:132
void setLength(unsigned L)
Definition: CommentLexer.h:98
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1124
void setText(StringRef Text)
Definition: CommentLexer.h:105
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:207
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:168
Information about a single command.
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:91
class LLVM_ALIGNAS(8) DependentTemplateSpecializationType const IdentifierInfo * Name
Represents a template specialization type whose template cannot be resolved, e.g. ...
Definition: Type.h:4549
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:163
void setCommandID(unsigned ID)
Definition: CommentLexer.h:127
void dump(const Lexer &L, const SourceManager &SM) const
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:95
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:137
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:174
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:135
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:83
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:873
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:153
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:116
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:158
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:212
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:82
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:196
The result type of a method or function.
const SourceManager & SM
Definition: Format.cpp:1184
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:94
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:147
This class provides information about commands that can be used in comments.
Kind
Re-lexes a sequence of tok::text tokens.
Encodes a location in the source.
const std::string ID
Comment lexer.
Definition: CommentLexer.h:222
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:92
Defines the Diagnostic-related interfaces.
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:122
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:111
Comment token.
Definition: CommentLexer.h:57
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid=nullptr) const
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:142
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:100
StringRef Text
Definition: Format.cpp:1195
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:190
This class handles loading and caching of source files into memory.
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:185