21 #include "llvm/Support/Regex.h"
29 : FormatTok(nullptr), IsFirstToken(
true), GreaterStashed(
false),
30 LessStashed(
false), Column(0), TrailingWhitespace(0),
31 SourceMgr(SourceMgr), ID(ID), Style(Style),
33 Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(
false),
34 MacroBlockBeginRegex(Style.MacroBlockBegin),
35 MacroBlockEndRegex(Style.MacroBlockEnd) {
38 Lex->SetKeepWhitespaceMode(
true);
41 ForEachMacros.push_back(&IdentTable.
get(ForEachMacro));
42 std::sort(ForEachMacros.begin(), ForEachMacros.end());
46 assert(Tokens.empty());
47 assert(FirstInLineIndex == 0);
49 Tokens.push_back(getNextToken());
51 tryParseJSRegexLiteral();
52 tryParseTemplateString();
54 tryMergePreviousTokens();
55 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
56 FirstInLineIndex = Tokens.size() - 1;
57 }
while (Tokens.back()->Tok.isNot(
tok::eof));
61 void FormatTokenLexer::tryMergePreviousTokens() {
62 if (tryMerge_TMacro())
64 if (tryMergeConflictMarkers())
66 if (tryMergeLessLess())
70 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
73 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
75 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
77 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
79 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
81 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
83 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
88 bool FormatTokenLexer::tryMergeLessLess() {
90 if (Tokens.size() < 3)
93 bool FourthTokenIsLess =
false;
94 if (Tokens.size() > 3)
95 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
97 auto First = Tokens.end() - 3;
98 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
99 First[0]->isNot(tok::less) || FourthTokenIsLess)
103 if (First[1]->WhitespaceRange.getBegin() !=
104 First[1]->WhitespaceRange.getEnd())
107 First[0]->Tok.setKind(tok::lessless);
108 First[0]->TokenText =
"<<";
109 First[0]->ColumnWidth += 1;
110 Tokens.erase(Tokens.end() - 2);
114 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
116 if (Tokens.size() < Kinds.size())
119 SmallVectorImpl<FormatToken *>::const_iterator First =
120 Tokens.end() - Kinds.size();
121 if (!First[0]->is(Kinds[0]))
123 unsigned AddLength = 0;
124 for (
unsigned i = 1; i < Kinds.size(); ++i) {
125 if (!First[i]->is(Kinds[i]) ||
126 First[i]->WhitespaceRange.getBegin() !=
127 First[i]->WhitespaceRange.getEnd())
129 AddLength += First[i]->TokenText.size();
131 Tokens.resize(Tokens.size() - Kinds.size() + 1);
132 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
133 First[0]->TokenText.size() + AddLength);
134 First[0]->ColumnWidth += AddLength;
135 First[0]->Type = NewType;
140 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
144 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
145 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
146 tok::colon, tok::question, tok::tilde) ||
147 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
148 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
150 Tok->isBinaryOperator();
153 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
161 if (Prev->isOneOf(tok::plusplus, tok::minusminus))
162 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
166 if (!precedesOperand(Prev))
176 void FormatTokenLexer::tryParseJSRegexLiteral() {
177 FormatToken *RegexToken = Tokens.back();
178 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
181 FormatToken *Prev =
nullptr;
182 for (
auto I = Tokens.rbegin() + 1,
E = Tokens.rend();
I !=
E; ++
I) {
185 if ((*I)->isNot(tok::comment)) {
191 if (!canPrecedeRegexLiteral(Prev))
195 const char *
Offset = Lex->getBufferLocation();
196 const char *RegexBegin = Offset - RegexToken->TokenText.size();
197 StringRef
Buffer = Lex->getBuffer();
198 bool InCharacterClass =
false;
199 bool HaveClosingSlash =
false;
200 for (; !HaveClosingSlash && Offset != Buffer.end(); ++
Offset) {
210 InCharacterClass =
true;
213 InCharacterClass =
false;
216 if (!InCharacterClass)
217 HaveClosingSlash =
true;
222 RegexToken->Type = TT_RegexLiteral;
224 RegexToken->Tok.setKind(tok::string_literal);
225 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
226 RegexToken->ColumnWidth = RegexToken->TokenText.size();
228 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
231 void FormatTokenLexer::tryParseTemplateString() {
232 FormatToken *BacktickToken = Tokens.back();
233 if (!BacktickToken->is(tok::unknown) || BacktickToken->TokenText !=
"`")
237 const char *Offset = Lex->getBufferLocation();
238 const char *TmplBegin = Offset - BacktickToken->TokenText.size();
239 for (; Offset != Lex->getBuffer().end() && *Offset !=
'`'; ++
Offset) {
244 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
245 BacktickToken->Type = TT_TemplateString;
246 BacktickToken->Tok.setKind(tok::string_literal);
247 BacktickToken->TokenText = LiteralText;
250 size_t FirstBreak = LiteralText.find(
'\n');
251 StringRef FirstLineText = FirstBreak == StringRef::npos
253 : LiteralText.substr(0, FirstBreak);
255 FirstLineText, BacktickToken->OriginalColumn, Style.
TabWidth, Encoding);
256 size_t LastBreak = LiteralText.rfind(
'\n');
257 if (LastBreak != StringRef::npos) {
258 BacktickToken->IsMultiline =
true;
259 unsigned StartColumn = 0;
261 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
265 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset + 1)));
268 bool FormatTokenLexer::tryMerge_TMacro() {
269 if (Tokens.size() < 4)
271 FormatToken *
Last = Tokens.back();
272 if (!Last->is(tok::r_paren))
275 FormatToken *String = Tokens[Tokens.size() - 2];
276 if (!String->is(tok::string_literal) || String->IsMultiline)
279 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
282 FormatToken *
Macro = Tokens[Tokens.size() - 4];
283 if (Macro->TokenText !=
"_T")
286 const char *Start = Macro->TokenText.data();
287 const char *
End = Last->TokenText.data() + Last->TokenText.size();
288 String->TokenText = StringRef(Start, End - Start);
289 String->IsFirst = Macro->IsFirst;
290 String->LastNewlineOffset = Macro->LastNewlineOffset;
291 String->WhitespaceRange = Macro->WhitespaceRange;
292 String->OriginalColumn = Macro->OriginalColumn;
294 String->TokenText, String->OriginalColumn, Style.
TabWidth, Encoding);
295 String->NewlinesBefore = Macro->NewlinesBefore;
296 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
301 Tokens.back() = String;
305 bool FormatTokenLexer::tryMergeConflictMarkers() {
306 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(
tok::eof))
320 unsigned FirstInLineOffset;
322 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
323 StringRef Buffer = SourceMgr.
getBuffer(ID)->getBuffer();
325 auto LineOffset = Buffer.rfind(
'\n', FirstInLineOffset);
326 if (LineOffset == StringRef::npos) {
332 auto FirstSpace = Buffer.find_first_of(
" \n", LineOffset);
334 if (FirstSpace == StringRef::npos) {
335 LineStart = Buffer.substr(LineOffset);
337 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
341 if (LineStart ==
"<<<<<<<" || LineStart ==
">>>>") {
342 Type = TT_ConflictStart;
343 }
else if (LineStart ==
"|||||||" || LineStart ==
"=======" ||
344 LineStart ==
"====") {
345 Type = TT_ConflictAlternative;
346 }
else if (LineStart ==
">>>>>>>" || LineStart ==
"<<<<") {
347 Type = TT_ConflictEnd;
350 if (Type != TT_Unknown) {
351 FormatToken *
Next = Tokens.back();
353 Tokens.resize(FirstInLineIndex + 1);
357 Tokens.back()->Type = Type;
358 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
360 Tokens.push_back(Next);
367 FormatToken *FormatTokenLexer::getStashedToken() {
370 StringRef TokenText = FormatTok->
TokenText;
373 FormatTok =
new (Allocator.Allocate()) FormatToken;
374 FormatTok->
Tok = Tok;
375 SourceLocation TokLocation =
386 FormatToken *FormatTokenLexer::getNextToken() {
387 if (GreaterStashed) {
388 GreaterStashed =
false;
389 return getStashedToken();
393 return getStashedToken();
396 FormatTok =
new (Allocator.Allocate()) FormatToken;
397 readRawToken(*FormatTok);
398 SourceLocation WhitespaceStart =
400 FormatTok->
IsFirst = IsFirstToken;
401 IsFirstToken =
false;
404 unsigned WhitespaceLength = TrailingWhitespace;
405 while (FormatTok->
Tok.
is(tok::unknown)) {
407 auto EscapesNewline = [&](
int pos) {
409 if (pos >= 0 && Text[pos] ==
'\r')
413 for (; pos >= 0; --pos, ++count)
414 if (Text[pos] !=
'\\')
420 for (
int i = 0, e = Text.size(); i != e; ++i) {
443 if (i + 1 == e || (Text[i + 1] !=
'\r' && Text[i + 1] !=
'\n'))
444 FormatTok->
Type = TT_ImplicitStringLiteral;
447 FormatTok->
Type = TT_ImplicitStringLiteral;
450 if (FormatTok->
Type == TT_ImplicitStringLiteral)
454 if (FormatTok->
is(TT_ImplicitStringLiteral))
458 readRawToken(*FormatTok);
468 WhitespaceLength += 2;
475 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
479 TrailingWhitespace = 0;
480 if (FormatTok->
Tok.
is(tok::comment)) {
482 StringRef UntrimmedText = FormatTok->
TokenText;
484 TrailingWhitespace = UntrimmedText.size() - FormatTok->
TokenText.size();
485 }
else if (FormatTok->
Tok.
is(tok::raw_identifier)) {
486 IdentifierInfo &Info = IdentTable.
get(FormatTok->
TokenText);
490 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
495 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union,
500 }
else if (FormatTok->
Tok.
is(tok::greatergreater)) {
503 GreaterStashed =
true;
504 }
else if (FormatTok->
Tok.
is(tok::lessless)) {
513 size_t FirstNewlinePos = Text.find(
'\n');
514 if (FirstNewlinePos == StringRef::npos) {
525 Text.substr(0, FirstNewlinePos), Column, Style.
TabWidth, Encoding);
530 Text.substr(Text.find_last_of(
'\n') + 1), 0, Style.
TabWidth, Encoding);
535 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
536 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
538 std::find(ForEachMacros.begin(), ForEachMacros.end(),
540 FormatTok->
Type = TT_ForEachMacro;
541 }
else if (FormatTok->
is(tok::identifier)) {
542 if (MacroBlockBeginRegex.match(Text)) {
543 FormatTok->
Type = TT_MacroBlockBegin;
544 }
else if (MacroBlockEndRegex.match(Text)) {
545 FormatTok->
Type = TT_MacroBlockEnd;
553 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
554 Lex->LexFromRawLexer(Tok.Tok);
555 Tok.TokenText = StringRef(SourceMgr.
getCharacterData(Tok.Tok.getLocation()),
556 Tok.Tok.getLength());
559 if (Tok.is(tok::unknown)) {
560 if (!Tok.TokenText.empty() && Tok.TokenText[0] ==
'"') {
561 Tok.Tok.setKind(tok::string_literal);
562 Tok.IsUnterminatedLiteral =
true;
564 Tok.TokenText ==
"''") {
565 Tok.Tok.setKind(tok::string_literal);
570 Tok.is(tok::char_constant)) {
571 Tok.Tok.setKind(tok::string_literal);
574 if (Tok.is(tok::comment) && (Tok.TokenText ==
"// clang-format on" ||
575 Tok.TokenText ==
"/* clang-format on */")) {
576 FormattingDisabled =
false;
579 Tok.Finalized = FormattingDisabled;
581 if (Tok.is(tok::comment) && (Tok.TokenText ==
"// clang-format off" ||
582 Tok.TokenText ==
"/* clang-format off */")) {
583 FormattingDisabled =
true;
587 void FormatTokenLexer::resetLexer(
unsigned Offset) {
591 Buffer.begin() +
Offset, Buffer.end()));
592 Lex->SetKeepWhitespaceMode(
true);
593 TrailingWhitespace = 0;
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Defines the SourceManager interface.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
std::unique_ptr< llvm::MemoryBuffer > Buffer
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
void setKind(tok::TokenKind K)
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
detail::InMemoryDirectory::const_iterator I
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
The l-value was considered opaque, so the alignment was determined from a type.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
void setIdentifierInfo(IdentifierInfo *II)
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const AdditionalKeywords & Keywords
detail::InMemoryDirectory::const_iterator E
Defines the clang::SourceLocation class and associated facilities.
unsigned getLength() const
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
void setLocation(SourceLocation L)
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
This class handles loading and caching of source files into memory.
IdentifierInfo * getIdentifierInfo() const