14 #include "llvm/ADT/StringExtras.h"
15 #include "llvm/ADT/StringSwitch.h"
16 #include "llvm/Support/ConvertUTF.h"
17 #include "llvm/Support/ErrorHandling.h"
23 llvm::errs() <<
"comments::Token Kind=" <<
Kind <<
" ";
25 llvm::errs() <<
" " << Length <<
" \"" << L.
getSpelling(*
this, SM) <<
"\"\n";
43 char *Resolved = Allocator.Allocate<
char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44 char *ResolvedPtr = Resolved;
45 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46 return StringRef(Resolved, ResolvedPtr - Resolved);
53 #include "clang/AST/CommentHTMLTags.inc"
54 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
// Looks up Name with an llvm::StringSwitch and returns the resulting StringRef.
// The individual cases are not visible in this excerpt; names that no case
// matches fall back to translateHTMLNamedCharacterReferenceToUTF8(Name).
// NOTE(review): the argument of .Default() is an ordinary call argument, so
// translateHTMLNamedCharacterReferenceToUTF8(Name) is evaluated on every call,
// not only when the switch falls through -- confirm that cost is acceptable.
58 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef
Name)
const {
60 return llvm::StringSwitch<StringRef>(
Name)
67 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
// Accumulates the characters of Name, read as ASCII decimal digits, into
// CodePoint. The rest of the loop body and the return are not visible here.
70 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name)
const {
71 unsigned CodePoint = 0;
72 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
// Add the value of the current digit character.
// NOTE(review): no digit validation or overflow guard on the unsigned
// accumulator is visible in this excerpt -- confirm the caller only passes
// digit strings and that very long inputs are acceptable.
75 CodePoint += Name[i] -
'0';
// Accumulates the characters of Name, read as hexadecimal digits, into
// CodePoint. The rest of the loop body and the return are not visible here.
80 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name)
const {
81 unsigned CodePoint = 0;
82 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
84 const char C = Name[i];
// llvm::hexDigitValue() supplies the numeric value of the digit.
// NOTE(review): its result is added without a visible validity check --
// confirm the caller only passes hex digits.
86 CodePoint += llvm::hexDigitValue(C);
// Advances BufferPtr over decoration at the start of a line.
// Only valid while lexing a C comment, as the assertion below enforces.
91 void Lexer::skipLineStartingDecorations() {
93 assert(CommentState == LCS_InsideCComment);
// Nothing left in the comment to examine.
95 if (BufferPtr == CommentEnd)
// Scan with a scratch pointer; in the visible lines BufferPtr itself is only
// written once, at the end.
103 const char *NewBufferPtr = BufferPtr;
105 if (NewBufferPtr == CommentEnd)
108 char C = *NewBufferPtr;
111 if (NewBufferPtr == CommentEnd)
// Commit: resume lexing one character past the scratch pointer.
116 BufferPtr = NewBufferPtr + 1;
127 const char *findNewline(
const char *BufferPtr,
const char *BufferEnd) {
128 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
// Steps over one line terminator starting at BufferPtr.
// The visible checks recognize '\n', and '\r' with an optional following '\n';
// the return statements are not visible in this excerpt.
135 const char *skipNewline(
const char *BufferPtr,
const char *BufferEnd) {
// The end-of-buffer case is tested before anything is dereferenced.
136 if (BufferPtr == BufferEnd)
139 if (*BufferPtr ==
'\n')
// Anything that is not '\n' must be '\r'; other characters are a caller bug.
142 assert(*BufferPtr ==
'\r');
// Look for a '\n' completing a "\r\n" pair; the bounds check comes first so
// the dereference is safe.
144 if (BufferPtr != BufferEnd && *BufferPtr ==
'\n')
150 const char *skipNamedCharacterReference(
const char *BufferPtr,
151 const char *BufferEnd) {
152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
159 const char *skipDecimalCharacterReference(
const char *BufferPtr,
160 const char *BufferEnd) {
161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168 const char *skipHexCharacterReference(
const char *BufferPtr,
169 const char *BufferEnd) {
170 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
177 bool isHTMLIdentifierStartingCharacter(
char C) {
181 bool isHTMLIdentifierCharacter(
char C) {
185 const char *skipHTMLIdentifier(
const char *BufferPtr,
const char *BufferEnd) {
186 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
187 if (!isHTMLIdentifierCharacter(*BufferPtr))
// Scans a quoted string whose opening quote is at BufferPtr, looking for the
// matching closing quote before BufferEnd.
197 const char *skipHTMLQuotedString(
const char *BufferPtr,
const char *BufferEnd)
// The opening character selects which quote terminates the string; it must be
// a double or single quote.
199 const char Quote = *BufferPtr;
200 assert(Quote ==
'\"' || Quote ==
'\'');
203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204 const char C = *BufferPtr;
// A quote character directly preceded by a backslash does not end the string.
// NOTE(review): only the single preceding character is inspected, so a quote
// that follows an escaped backslash (\\) is also treated as escaped.
205 if (C == Quote && BufferPtr[-1] !=
'\\')
211 const char *
skipWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
212 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 bool isWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
223 bool isCommandNameStartCharacter(
char C) {
227 bool isCommandNameCharacter(
char C) {
231 const char *skipCommandName(
const char *BufferPtr,
const char *BufferEnd) {
232 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
233 if (!isCommandNameCharacter(*BufferPtr))
// Searches [BufferPtr, BufferEnd) for the end of a BCPL (//) comment, treating
// an escaped newline as a continuation of the comment.
241 const char *findBCPLCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
242 const char *CurPtr = BufferPtr;
243 while (CurPtr != BufferEnd) {
246 if (CurPtr == BufferEnd)
// Inspect the character immediately before CurPtr.
250 const char *EscapePtr = CurPtr - 1;
// The newline counts as escaped when it is preceded by a backslash or by the
// trigraph spelling of a backslash, "??/". The bounds test keeps the two
// extra look-behind reads inside the range that starts at BufferPtr.
254 if (*EscapePtr ==
'\\' ||
255 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] ==
'/' &&
256 EscapePtr[-1] ==
'?' && EscapePtr[-2] ==
'?')) {
// Escaped: step over the newline and keep scanning.
258 CurPtr = skipNewline(CurPtr, BufferEnd);
// Scans [BufferPtr, BufferEnd) for the "*/" that terminates a C comment.
// The caller must guarantee the terminator exists: running off the end of the
// buffer is reported through llvm_unreachable rather than handled.
267 const char *findCCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
268 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
269 if (*BufferPtr ==
'*') {
// A '*' is never expected to be the last character of the buffer; this makes
// the one-character look-ahead below safe in asserts-enabled builds.
270 assert(BufferPtr + 1 != BufferEnd);
271 if (*(BufferPtr + 1) ==
'/')
275 llvm_unreachable(
"buffer end hit before '*/' was seen");
// Fills Result with a token that spans from the current BufferPtr up to TokEnd.
// Part of the parameter list (the source of Kind) is not visible in this excerpt.
280 void Lexer::formTokenWithChars(
Token &Result,
const char *TokEnd,
// The token length is the distance from the current position to TokEnd.
282 const unsigned TokLen = TokEnd - BufferPtr;
283 Result.setLocation(getSourceLocation(BufferPtr));
284 Result.setKind(Kind);
285 Result.setLength(TokLen);
// Placeholder text pointer, making a token whose text was never set easy to
// recognize.
// NOTE(review): any surrounding conditional compilation is not visible here.
287 Result.TextPtr =
"<UNSET>";
// Lexes the next token from the body of a comment into T.
// Must be called while inside a BCPL or a C comment (asserted below).
293 void Lexer::lexCommentText(
Token &T) {
294 assert(CommentState == LCS_InsideBCPLComment ||
295 CommentState == LCS_InsideCComment);
// Every lexer state other than LS_Normal is handed to its dedicated routine.
300 case LS_VerbatimBlockFirstLine:
301 lexVerbatimBlockFirstLine(T);
303 case LS_VerbatimBlockBody:
304 lexVerbatimBlockBody(T);
306 case LS_VerbatimLineText:
307 lexVerbatimLineText(T);
309 case LS_HTMLStartTag:
// From here on only the normal state remains.
317 assert(State == LS_Normal);
// TokenPtr scans ahead; BufferPtr stays at the start of the token being formed.
319 const char *TokenPtr = BufferPtr;
320 assert(TokenPtr < CommentEnd);
321 while (TokenPtr != CommentEnd) {
// Running into the end of the comment yields a plain text token.
331 if (TokenPtr == CommentEnd) {
332 formTextToken(T, TokenPtr);
// Characters handled by this group of case labels (the enclosing switch is
// not visible in this excerpt).
340 case '\\':
case '@':
case '&':
case '$':
341 case '#':
case '<':
case '>':
case '%':
342 case '\"':
case '.':
case ':':
// A ':' followed by a second ':' is special-cased; the handling itself is
// not visible here.
345 if (C ==
':' && TokenPtr != CommentEnd && *TokenPtr ==
':') {
// The token covers BufferPtr..TokenPtr, but its text starts one character
// later, leaving out the first character at BufferPtr.
349 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
350 formTokenWithChars(T, TokenPtr,
tok::text);
351 T.setText(UnescapedText);
// Without a valid command-name start character this is just text.
356 if (!isCommandNameStartCharacter(*TokenPtr)) {
357 formTextToken(T, TokenPtr);
// Length counts the command name only, excluding the character at BufferPtr.
361 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
362 unsigned Length = TokenPtr - (BufferPtr + 1);
// A one-letter name 'f' followed by one of $ [ ] { } is special-cased; the
// handling itself is not visible here.
366 if (Length == 1 && TokenPtr[-1] ==
'f' && TokenPtr != CommentEnd) {
368 if (C ==
'$' || C ==
'[' || C ==
']' || C ==
'{' || C ==
'}') {
374 StringRef CommandName(BufferPtr + 1, Length);
// Diagnose a corrected command name; the highlighted range starts one past
// Loc (Loc is the location of BufferPtr) and ends at TokenPtr.
379 StringRef CorrectedName = Info->
Name;
380 SourceLocation Loc = getSourceLocation(BufferPtr);
381 SourceRange CommandRange(Loc.getLocWithOffset(1),
382 getSourceLocation(TokenPtr));
383 Diag(Loc, diag::warn_correct_comment_command_name)
384 << CommandName << CorrectedName
// Record the unknown command name on the token and warn about it.
388 T.setUnknownCommandName(CommandName);
389 Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
// Verbatim block and verbatim line commands switch to their own lexing
// routines; the block variant also receives the character at BufferPtr.
393 if (Info->IsVerbatimBlockCommand) {
394 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
397 if (Info->IsVerbatimLineCommand) {
398 setupAndLexVerbatimLine(T, TokenPtr, Info);
// Ordinary command: form the token and attach the command ID.
401 formTokenWithChars(T, TokenPtr, CommandKind);
402 T.setCommandID(Info->getID());
407 lexHTMLCharacterReference(T);
412 if (TokenPtr == CommentEnd) {
413 formTextToken(T, TokenPtr);
// An identifier start character selects the start-tag routine; the
// conditions guarding the end-tag and plain-text outcomes below are not
// visible in this excerpt.
416 const char C = *TokenPtr;
417 if (isHTMLIdentifierStartingCharacter(C))
418 setupAndLexHTMLStartTag(T);
420 setupAndLexHTMLEndTag(T);
422 formTextToken(T, TokenPtr);
// After a newline inside a C comment, skip the decoration at the start of
// the next line.
428 TokenPtr = skipNewline(TokenPtr, CommentEnd);
431 if (CommentState == LCS_InsideCComment)
432 skipLineStartingDecorations();
// Search the rest of the comment for the next character that needs special
// handling (newline, command marker, '&' or '<'); one visible outcome extends
// the text token to CommentEnd.
436 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
437 find_first_of(
"\n\r\\@&<");
438 if (End != StringRef::npos)
441 TokenPtr = CommentEnd;
442 formTextToken(T, TokenPtr);
// Starts a verbatim block for the command described by Info.
// Marker is the character that introduced the command; it is reused to spell
// the matching end command.
449 void Lexer::setupAndLexVerbatimBlock(
Token &T,
450 const char *TextBegin,
451 char Marker,
const CommandInfo *Info) {
452 assert(Info->IsVerbatimBlockCommand);
// Build the end-command spelling: a backslash when Marker is a backslash,
// '@' otherwise, followed by the end command name from Info.
454 VerbatimBlockEndCommandName.clear();
455 VerbatimBlockEndCommandName.append(Marker ==
'\\' ?
"\\" :
"@");
456 VerbatimBlockEndCommandName.append(Info->EndCommandName);
459 T.setVerbatimBlockID(Info->getID());
// Two follow-up states are possible: LS_VerbatimBlockBody after a newline has
// been skipped, or LS_VerbatimBlockFirstLine. The full condition is not
// visible in this excerpt.
464 if (BufferPtr != CommentEnd &&
466 BufferPtr = skipNewline(BufferPtr, CommentEnd);
467 State = LS_VerbatimBlockBody;
471 State = LS_VerbatimBlockFirstLine;
// Lexes one line of a verbatim block, watching for the end command.
// Requires unread input in the comment (asserted below).
474 void Lexer::lexVerbatimBlockFirstLine(
Token &T) {
476 assert(BufferPtr < CommentEnd);
// Line is the text from the current position up to the next newline.
482 const char *Newline = findNewline(BufferPtr, CommentEnd);
483 StringRef
Line(BufferPtr, Newline - BufferPtr);
// Look for the end command (marker + name) anywhere on this line.
486 size_t Pos =
Line.find(VerbatimBlockEndCommandName);
488 const char *NextLine;
489 if (Pos == StringRef::npos) {
// No end command on this line: the next line starts after the newline.
492 NextLine = skipNewline(Newline, CommentEnd);
493 }
else if (Pos == 0) {
// The end command is the first thing on the line. Name leaves out its
// leading marker character.
495 const char *
End = BufferPtr + VerbatimBlockEndCommandName.size();
496 StringRef
Name(BufferPtr + 1, End - (BufferPtr + 1));
// The text ends where the end command begins.
503 TextEnd = BufferPtr + Pos;
512 StringRef
Text(BufferPtr, TextEnd - BufferPtr);
514 T.setVerbatimBlockText(
Text);
// Later lines are lexed as block body.
516 State = LS_VerbatimBlockBody;
// Lexes a line of verbatim block body. Only valid in LS_VerbatimBlockBody
// (asserted below).
519 void Lexer::lexVerbatimBlockBody(
Token &T) {
520 assert(State == LS_VerbatimBlockBody);
// Inside a C comment, skip the decoration at the start of the line first.
522 if (CommentState == LCS_InsideCComment)
523 skipLineStartingDecorations();
// At the end of the comment the token gets empty verbatim text.
525 if (BufferPtr == CommentEnd) {
527 T.setVerbatimBlockText(
"");
// The per-line work is shared with the first-line routine.
531 lexVerbatimBlockFirstLine(T);
// Starts a verbatim line for the command described by Info: records the
// command ID on T and switches the lexer to LS_VerbatimLineText.
534 void Lexer::setupAndLexVerbatimLine(
Token &T,
const char *TextBegin,
535 const CommandInfo *Info) {
536 assert(Info->IsVerbatimLineCommand);
538 T.setVerbatimLineID(Info->getID());
540 State = LS_VerbatimLineText;
// Lexes the text of a verbatim line. Only valid in LS_VerbatimLineText
// (asserted below).
543 void Lexer::lexVerbatimLineText(
Token &T) {
544 assert(State == LS_VerbatimLineText);
// The text runs from the current position up to the next newline.
547 const char *Newline = findNewline(BufferPtr, CommentEnd);
548 StringRef
Text(BufferPtr, Newline - BufferPtr);
550 T.setVerbatimLineText(
Text);
// Lexes an HTML character reference starting at the '&' under BufferPtr.
// Three forms are distinguished below: named, decimal ('#') and hexadecimal
// ('#' followed by 'x' or 'X'). Input that does not form a resolvable
// reference is emitted as a plain text token.
555 void Lexer::lexHTMLCharacterReference(
Token &T) {
556 const char *TokenPtr = BufferPtr;
557 assert(*TokenPtr ==
'&');
// Hitting the end of the comment yields plain text.
559 if (TokenPtr == CommentEnd) {
560 formTextToken(T, TokenPtr);
565 bool isDecimal =
false;
// Named form.
569 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
571 }
else if (C ==
'#') {
573 if (TokenPtr == CommentEnd) {
574 formTextToken(T, TokenPtr);
// Decimal form.
580 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
582 }
else if (C ==
'x' || C ==
'X') {
// Hexadecimal form.
585 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
587 formTextToken(T, TokenPtr);
591 formTextToken(T, TokenPtr);
// An empty name, or reaching the end of the comment, is among the conditions
// that fall back to plain text (the rest of the condition is not visible).
594 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
596 formTextToken(T, TokenPtr);
599 StringRef
Name(NamePtr, TokenPtr - NamePtr);
// Resolve Name with the routine that matches the detected form.
603 Resolved = resolveHTMLNamedCharacterReference(Name);
605 Resolved = resolveHTMLDecimalCharacterReference(Name);
607 Resolved = resolveHTMLHexCharacterReference(Name);
// An empty result means the reference could not be resolved: emit the source
// characters as plain text.
609 if (Resolved.empty()) {
610 formTextToken(T, TokenPtr);
613 formTokenWithChars(T, TokenPtr,
tok::text);
// Begins lexing an HTML start tag at BufferPtr.
// Precondition (asserted): BufferPtr points at '<' followed by a character
// accepted by isHTMLIdentifierStartingCharacter().
617 void Lexer::setupAndLexHTMLStartTag(
Token &T) {
618 assert(BufferPtr[0] ==
'<' &&
619 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
// The first name character was validated by the assertion, so the identifier
// scan resumes at BufferPtr + 2. Name leaves out the leading '<'.
620 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
621 StringRef
Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
// Names rejected by isHTMLTagName() are emitted as a text token instead.
622 if (!isHTMLTagName(Name)) {
623 formTextToken(T, TagNameEnd);
628 T.setHTMLTagStartName(Name);
// Enter the start-tag state only when something that can continue the tag
// follows: '>', '/', or an identifier start character.
// NOTE(review): *BufferPtr is read before the BufferPtr != CommentEnd test;
// this relies on CommentEnd pointing at a readable character -- confirm, or
// move the bounds check ahead of the dereference.
632 const char C = *BufferPtr;
633 if (BufferPtr != CommentEnd &&
634 (C ==
'>' || C ==
'/' || isHTMLIdentifierStartingCharacter(C)))
635 State = LS_HTMLStartTag;
// Lexes one token inside an HTML start tag. Only valid in LS_HTMLStartTag
// (asserted below).
638 void Lexer::lexHTMLStartTag(
Token &T) {
639 assert(State == LS_HTMLStartTag);
641 const char *TokenPtr = BufferPtr;
// Identifier: the token text is the identifier's characters.
643 if (isHTMLIdentifierCharacter(C)) {
644 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
645 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
647 T.setHTMLIdent(Ident);
// Quoted string: remember where the opening quote is, scan for the closing
// one, and store the text between them without the quote characters.
656 const char *OpenQuote = TokenPtr;
657 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
658 const char *ClosingQuote = TokenPtr;
// TokenPtr == CommentEnd here means the scan reached the end of the comment.
659 if (TokenPtr != CommentEnd)
662 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
663 ClosingQuote - (OpenQuote + 1)));
673 if (TokenPtr != CommentEnd && *TokenPtr ==
'>') {
677 formTextToken(T, TokenPtr);
687 if (BufferPtr == CommentEnd) {
// Check whether the next character is one the start tag can continue with:
// an identifier start, '=', a quote, or '>'. The action taken is not visible
// in this excerpt.
693 if (!isHTMLIdentifierStartingCharacter(C) &&
694 C !=
'=' && C !=
'\"' && C !=
'\'' && C !=
'>') {
// Begins lexing an HTML end tag at BufferPtr.
// Precondition (asserted): BufferPtr points at "</".
700 void Lexer::setupAndLexHTMLEndTag(
Token &T) {
701 assert(BufferPtr[0] ==
'<' && BufferPtr[1] ==
'/');
// Whitespace between "</" and the tag name is skipped before the name is read.
703 const char *TagNameBegin =
skipWhitespace(BufferPtr + 2, CommentEnd);
704 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
705 StringRef
Name(TagNameBegin, TagNameEnd - TagNameBegin);
// Names rejected by isHTMLTagName() are emitted as a text token instead.
706 if (!isHTMLTagName(Name)) {
707 formTextToken(T, TagNameEnd);
714 T.setHTMLTagEndName(Name);
// Enter the end-tag state only when the closing '>' is the next character;
// the bounds check precedes the dereference.
716 if (BufferPtr != CommentEnd && *BufferPtr ==
'>')
717 State = LS_HTMLEndTag;
// Lexes the closing '>' of an HTML end tag.
// Precondition (asserted): BufferPtr is inside the comment and points at '>'.
720 void Lexer::lexHTMLEndTag(
Token &T) {
721 assert(BufferPtr != CommentEnd && *BufferPtr ==
'>');
730 const char *BufferStart,
const char *BufferEnd):
731 Allocator(Allocator), Diags(Diags), Traits(Traits),
732 BufferStart(BufferStart), BufferEnd(BufferEnd),
733 FileLoc(FileLoc), BufferPtr(BufferStart),
734 CommentState(LCS_BeforeComment),
State(LS_Normal) {
739 switch (CommentState) {
740 case LCS_BeforeComment:
741 if (BufferPtr == BufferEnd) {
742 formTokenWithChars(T, BufferPtr,
tok::eof);
746 assert(*BufferPtr ==
'/');
752 if (BufferPtr != BufferEnd) {
757 const char C = *BufferPtr;
758 if (C ==
'/' || C ==
'!')
765 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
768 CommentState = LCS_InsideBCPLComment;
769 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
771 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
778 const char C = *BufferPtr;
779 if ((C ==
'*' && *(BufferPtr + 1) !=
'/') || C ==
'!')
783 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
786 CommentState = LCS_InsideCComment;
788 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
792 llvm_unreachable(
"second character of comment should be '/' or '*'");
795 case LCS_BetweenComments: {
798 const char *EndWhitespace = BufferPtr;
799 while(EndWhitespace != BufferEnd && *EndWhitespace !=
'/')
808 CommentState = LCS_BeforeComment;
812 case LCS_InsideBCPLComment:
813 case LCS_InsideCComment:
814 if (BufferPtr != CommentEnd) {
819 if (CommentState == LCS_InsideCComment) {
820 assert(BufferPtr[0] ==
'*' && BufferPtr[1] ==
'/');
822 assert(BufferPtr <= BufferEnd);
828 CommentState = LCS_BetweenComments;
832 CommentState = LCS_BetweenComments;
841 bool *Invalid)
const {
845 bool InvalidTemp =
false;
846 StringRef File = SourceMgr.
getBufferData(LocInfo.first, &InvalidTemp);
852 const char *Begin = File.data() + LocInfo.second;
853 return StringRef(Begin, Tok.
getLength());
static LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
static LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
static LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
class LLVM_ALIGNAS(8) DependentTemplateSpecializationType const IdentifierInfo * Name
Represents a template specialization type whose template cannot be resolved, e.g. A<T>::template B<T>.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
static LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
class DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
void dump(const SourceManager &SM) const
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
class SourceLocation
Encodes a location in the source.
static LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
static LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
class SourceManager
This class handles loading and caching of source files into memory.
static LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].