Bug Summary

File: tools/clang/include/clang/Lex/Token.h
Warning: line 237, column 11
The left-hand operand of the compound assignment is an uninitialized value, so the computed result will also be garbage.

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D CLANG_VENDOR="Debian " -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/include -I /build/llvm-toolchain-snapshot-10~svn374877/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~svn374877/build-llvm/tools/clang/lib/Lex -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~svn374877=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-10-15-233810-7101-1 -x c++ /build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp

/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
15#include "clang/Basic/CharInfo.h"
16#include "clang/Basic/IdentifierTable.h"
17#include "clang/Basic/LangOptions.h"
18#include "clang/Basic/SourceLocation.h"
19#include "clang/Basic/SourceManager.h"
20#include "clang/Basic/TokenKinds.h"
21#include "clang/Lex/LexDiagnostic.h"
22#include "clang/Lex/LiteralSupport.h"
23#include "clang/Lex/MultipleIncludeOpt.h"
24#include "clang/Lex/Preprocessor.h"
25#include "clang/Lex/PreprocessorOptions.h"
26#include "clang/Lex/Token.h"
27#include "clang/Basic/Diagnostic.h"
28#include "clang/Basic/LLVM.h"
29#include "clang/Basic/TokenKinds.h"
30#include "llvm/ADT/None.h"
31#include "llvm/ADT/Optional.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/ADT/StringSwitch.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/Support/Compiler.h"
36#include "llvm/Support/ConvertUTF.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/MemoryBuffer.h"
39#include "llvm/Support/NativeFormatting.h"
40#include "llvm/Support/UnicodeCharRanges.h"
41#include <algorithm>
42#include <cassert>
43#include <cstddef>
44#include <cstdint>
45#include <cstring>
46#include <string>
47#include <tuple>
48#include <utility>
49
50using namespace clang;
51
52//===----------------------------------------------------------------------===//
53// Token Class Implementation
54//===----------------------------------------------------------------------===//
55
56/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58 if (isAnnotation())
59 return false;
60 if (IdentifierInfo *II = getIdentifierInfo())
61 return II->getObjCKeywordID() == objcKey;
62 return false;
63}
64
65/// getObjCKeywordID - Return the ObjC keyword kind.
66tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67 if (isAnnotation())
68 return tok::objc_not_keyword;
69 IdentifierInfo *specId = getIdentifierInfo();
70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71}
72
73//===----------------------------------------------------------------------===//
74// Lexer Class Implementation
75//===----------------------------------------------------------------------===//
76
// Out-of-line definition to anchor the Lexer vtable to this translation unit
// (standard LLVM "virtual method anchor" idiom).
void Lexer::anchor() {}
78
79void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80 const char *BufEnd) {
81 BufferStart = BufStart;
82 BufferPtr = BufPtr;
83 BufferEnd = BufEnd;
84
85 assert(BufEnd[0] == 0 &&((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
86 "We assume that the input buffer has a null character at the end"((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
87 " to simplify lexing!")((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
;
88
89 // Check whether we have a BOM in the beginning of the buffer. If yes - act
90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91 // skip the UTF-8 BOM if it's present.
92 if (BufferStart == BufferPtr) {
93 // Determine the size of the BOM.
94 StringRef Buf(BufferStart, BufferEnd - BufferStart);
95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97 .Default(0);
98
99 // Skip the BOM.
100 BufferPtr += BOMLength;
101 }
102
103 Is_PragmaLexer = false;
104 CurrentConflictMarkerState = CMK_None;
105
106 // Start of the file is a start of line.
107 IsAtStartOfLine = true;
108 IsAtPhysicalStartOfLine = true;
109
110 HasLeadingSpace = false;
111 HasLeadingEmptyMacro = false;
112
113 // We are not after parsing a #.
114 ParsingPreprocessorDirective = false;
115
116 // We are not after parsing #include.
117 ParsingFilename = false;
118
119 // We are not in raw mode. Raw mode disables diagnostics and interpretation
120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122 // or otherwise skipping over tokens.
123 LexingRawMode = false;
124
125 // Default to not keeping comments.
126 ExtendedTokenMode = 0;
127}
128
129/// Lexer constructor - Create a new lexer object for the specified buffer
130/// with the specified preprocessor managing the lexing process. This lexer
131/// assumes that the associated file buffer and Preprocessor objects will
132/// outlive it, so it doesn't take ownership of either of them.
133Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
134 : PreprocessorLexer(&PP, FID),
135 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
136 LangOpts(PP.getLangOpts()) {
137 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
138 InputFile->getBufferEnd());
139
140 resetExtendedTokenMode();
141}
142
143/// Lexer constructor - Create a new raw lexer object. This object is only
144/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
145/// range will outlive it, so it doesn't take ownership of it.
146Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
147 const char *BufStart, const char *BufPtr, const char *BufEnd)
148 : FileLoc(fileloc), LangOpts(langOpts) {
149 InitLexer(BufStart, BufPtr, BufEnd);
150
151 // We *are* in raw mode.
152 LexingRawMode = true;
153}
154
155/// Lexer constructor - Create a new raw lexer object. This object is only
156/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
157/// range will outlive it, so it doesn't take ownership of it.
158Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
159 const SourceManager &SM, const LangOptions &langOpts)
160 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
161 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
162
163void Lexer::resetExtendedTokenMode() {
164 assert(PP && "Cannot reset token mode without a preprocessor")((PP && "Cannot reset token mode without a preprocessor"
) ? static_cast<void> (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 164, __PRETTY_FUNCTION__))
;
165 if (LangOpts.TraditionalCPP)
166 SetKeepWhitespaceMode(true);
167 else
168 SetCommentRetentionState(PP->getCommentRetentionState());
169}
170
171/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
172/// _Pragma expansion. This has a variety of magic semantics that this method
173/// sets up. It returns a new'd Lexer that must be delete'd when done.
174///
175/// On entrance to this routine, TokStartLoc is a macro location which has a
176/// spelling loc that indicates the bytes to be lexed for the token and an
177/// expansion location that indicates where all lexed tokens should be
178/// "expanded from".
179///
180/// TODO: It would really be nice to make _Pragma just be a wrapper around a
181/// normal lexer that remaps tokens as they fly by. This would require making
182/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
183/// interface that could handle this stuff. This would pull GetMappedTokenLoc
184/// out of the critical path of the lexer!
185///
186Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
187 SourceLocation ExpansionLocStart,
188 SourceLocation ExpansionLocEnd,
189 unsigned TokLen, Preprocessor &PP) {
190 SourceManager &SM = PP.getSourceManager();
191
192 // Create the lexer as if we were going to lex the file normally.
193 FileID SpellingFID = SM.getFileID(SpellingLoc);
194 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
195 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
196
197 // Now that the lexer is created, change the start/end locations so that we
198 // just lex the subsection of the file that we want. This is lexing from a
199 // scratch buffer.
200 const char *StrData = SM.getCharacterData(SpellingLoc);
201
202 L->BufferPtr = StrData;
203 L->BufferEnd = StrData+TokLen;
204 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")((L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"
) ? static_cast<void> (0) : __assert_fail ("L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 204, __PRETTY_FUNCTION__))
;
205
206 // Set the SourceLocation with the remapping information. This ensures that
207 // GetMappedTokenLoc will remap the tokens as they are lexed.
208 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
209 ExpansionLocStart,
210 ExpansionLocEnd, TokLen);
211
212 // Ensure that the lexer thinks it is inside a directive, so that end \n will
213 // return an EOD token.
214 L->ParsingPreprocessorDirective = true;
215
216 // This lexer really is for _Pragma.
217 L->Is_PragmaLexer = true;
218 return L;
219}
220
221bool Lexer::skipOver(unsigned NumBytes) {
222 IsAtPhysicalStartOfLine = true;
223 IsAtStartOfLine = true;
224 if ((BufferPtr + NumBytes) > BufferEnd)
225 return true;
226 BufferPtr += NumBytes;
227 return false;
228}
229
/// Escape \p Str in place for embedding in a quoted literal: backslashes and
/// \p Quote characters get a preceding backslash, and newline / carriage
/// return characters become the two-character sequence '\' 'n' (a CR-LF or
/// LF-CR pair collapses to a single such sequence).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, End = Str.size();
  while (Idx < End) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Prefix the character with a backslash.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
      ++End;
    } else if (C == '\n' || C == '\r') {
      if ((Idx < End - 1) && (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
          Str[Idx] != Str[Idx + 1]) {
        // A mixed "\r\n" / "\n\r" pair becomes '\' followed by 'n'.
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // A lone newline or carriage return becomes '\' followed by 'n'.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++End;
      }
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
254
255std::string Lexer::Stringify(StringRef Str, bool Charify) {
256 std::string Result = Str;
257 char Quote = Charify ? '\'' : '"';
258 StringifyImpl(Result, Quote);
259 return Result;
260}
261
// In-place variant: escape Str for embedding in a double-quoted literal.
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
263
264//===----------------------------------------------------------------------===//
265// Token Spelling
266//===----------------------------------------------------------------------===//
267
268/// Slow case of getSpelling. Extract the characters comprising the
269/// spelling of this token from the provided input buffer.
270static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
271 const LangOptions &LangOpts, char *Spelling) {
272 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")((Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? static_cast<void> (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 272, __PRETTY_FUNCTION__))
;
273
274 size_t Length = 0;
275 const char *BufEnd = BufPtr + Tok.getLength();
276
277 if (tok::isStringLiteral(Tok.getKind())) {
278 // Munch the encoding-prefix and opening double-quote.
279 while (BufPtr < BufEnd) {
280 unsigned Size;
281 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
282 BufPtr += Size;
283
284 if (Spelling[Length - 1] == '"')
285 break;
286 }
287
288 // Raw string literals need special handling; trigraph expansion and line
289 // splicing do not occur within their d-char-sequence nor within their
290 // r-char-sequence.
291 if (Length >= 2 &&
292 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
293 // Search backwards from the end of the token to find the matching closing
294 // quote.
295 const char *RawEnd = BufEnd;
296 do --RawEnd; while (*RawEnd != '"');
297 size_t RawLength = RawEnd - BufPtr + 1;
298
299 // Everything between the quotes is included verbatim in the spelling.
300 memcpy(Spelling + Length, BufPtr, RawLength);
301 Length += RawLength;
302 BufPtr += RawLength;
303
304 // The rest of the token is lexed normally.
305 }
306 }
307
308 while (BufPtr < BufEnd) {
309 unsigned Size;
310 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
311 BufPtr += Size;
312 }
313
314 assert(Length < Tok.getLength() &&((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 315, __PRETTY_FUNCTION__))
315 "NeedsCleaning flag set on token that didn't need cleaning!")((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 315, __PRETTY_FUNCTION__))
;
316 return Length;
317}
318
319/// getSpelling() - Return the 'spelling' of this token. The spelling of a
320/// token are the characters used to represent the token in the source file
321/// after trigraph expansion and escaped-newline folding. In particular, this
322/// wants to get the true, uncanonicalized, spelling of things like digraphs
323/// UCNs, etc.
324StringRef Lexer::getSpelling(SourceLocation loc,
325 SmallVectorImpl<char> &buffer,
326 const SourceManager &SM,
327 const LangOptions &options,
328 bool *invalid) {
329 // Break down the source location.
330 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
331
332 // Try to the load the file buffer.
333 bool invalidTemp = false;
334 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
335 if (invalidTemp) {
336 if (invalid) *invalid = true;
337 return {};
338 }
339
340 const char *tokenBegin = file.data() + locInfo.second;
341
342 // Lex from the start of the given location.
343 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
344 file.begin(), tokenBegin, file.end());
345 Token token;
346 lexer.LexFromRawLexer(token);
347
348 unsigned length = token.getLength();
349
350 // Common case: no need for cleaning.
351 if (!token.needsCleaning())
352 return StringRef(tokenBegin, length);
353
354 // Hard case, we need to relex the characters into the string.
355 buffer.resize(length);
356 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
357 return StringRef(buffer.data(), buffer.size());
358}
359
360/// getSpelling() - Return the 'spelling' of this token. The spelling of a
361/// token are the characters used to represent the token in the source file
362/// after trigraph expansion and escaped-newline folding. In particular, this
363/// wants to get the true, uncanonicalized, spelling of things like digraphs
364/// UCNs, etc.
365std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
366 const LangOptions &LangOpts, bool *Invalid) {
367 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 367, __PRETTY_FUNCTION__))
;
368
369 bool CharDataInvalid = false;
370 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
371 &CharDataInvalid);
372 if (Invalid)
373 *Invalid = CharDataInvalid;
374 if (CharDataInvalid)
375 return {};
376
377 // If this token contains nothing interesting, return it directly.
378 if (!Tok.needsCleaning())
379 return std::string(TokStart, TokStart + Tok.getLength());
380
381 std::string Result;
382 Result.resize(Tok.getLength());
383 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
384 return Result;
385}
386
387/// getSpelling - This method is used to get the spelling of a token into a
388/// preallocated buffer, instead of as an std::string. The caller is required
389/// to allocate enough space for the token, which is guaranteed to be at least
390/// Tok.getLength() bytes long. The actual length of the token is returned.
391///
392/// Note that this method may do two possible things: it may either fill in
393/// the buffer specified with characters, or it may *change the input pointer*
394/// to point to a constant buffer with the data already in it (avoiding a
395/// copy). The caller is not allowed to modify the returned buffer pointer
396/// if an internal buffer is returned.
397unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
398 const SourceManager &SourceMgr,
399 const LangOptions &LangOpts, bool *Invalid) {
400 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 400, __PRETTY_FUNCTION__))
;
401
402 const char *TokStart = nullptr;
403 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
404 if (Tok.is(tok::raw_identifier))
405 TokStart = Tok.getRawIdentifier().data();
406 else if (!Tok.hasUCN()) {
407 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
408 // Just return the string from the identifier table, which is very quick.
409 Buffer = II->getNameStart();
410 return II->getLength();
411 }
412 }
413
414 // NOTE: this can be checked even after testing for an IdentifierInfo.
415 if (Tok.isLiteral())
416 TokStart = Tok.getLiteralData();
417
418 if (!TokStart) {
419 // Compute the start of the token in the input lexer buffer.
420 bool CharDataInvalid = false;
421 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
422 if (Invalid)
423 *Invalid = CharDataInvalid;
424 if (CharDataInvalid) {
425 Buffer = "";
426 return 0;
427 }
428 }
429
430 // If this token contains nothing interesting, return it directly.
431 if (!Tok.needsCleaning()) {
432 Buffer = TokStart;
433 return Tok.getLength();
434 }
435
436 // Otherwise, hard case, relex the characters into the string.
437 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
438}
439
440/// MeasureTokenLength - Relex the token at the specified location and return
441/// its length in bytes in the input file. If the token needs cleaning (e.g.
442/// includes a trigraph or an escaped newline) then this count includes bytes
443/// that are part of that.
444unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
445 const SourceManager &SM,
446 const LangOptions &LangOpts) {
447 Token TheTok;
448 if (getRawToken(Loc, TheTok, SM, LangOpts))
449 return 0;
450 return TheTok.getLength();
451}
452
453/// Relex the token at the specified location.
454/// \returns true if there was a failure, false on success.
455bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
456 const SourceManager &SM,
457 const LangOptions &LangOpts,
458 bool IgnoreWhiteSpace) {
459 // TODO: this could be special cased for common tokens like identifiers, ')',
460 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
461 // all obviously single-char tokens. This could use
462 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
463 // something.
464
465 // If this comes from a macro expansion, we really do want the macro name, not
466 // the token this macro expanded to.
467 Loc = SM.getExpansionLoc(Loc);
468 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
469 bool Invalid = false;
470 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
471 if (Invalid)
472 return true;
473
474 const char *StrData = Buffer.data()+LocInfo.second;
475
476 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
477 return true;
478
479 // Create a lexer starting at the beginning of this token.
480 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
481 Buffer.begin(), StrData, Buffer.end());
482 TheLexer.SetCommentRetentionState(true);
483 TheLexer.LexFromRawLexer(Result);
484 return false;
485}
486
487/// Returns the pointer that points to the beginning of line that contains
488/// the given offset, or null if the offset if invalid.
489static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
490 const char *BufStart = Buffer.data();
491 if (Offset >= Buffer.size())
492 return nullptr;
493
494 const char *LexStart = BufStart + Offset;
495 for (; LexStart != BufStart; --LexStart) {
496 if (isVerticalWhitespace(LexStart[0]) &&
497 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
498 // LexStart should point at first character of logical line.
499 ++LexStart;
500 break;
501 }
502 }
503 return LexStart;
504}
505
506static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
507 const SourceManager &SM,
508 const LangOptions &LangOpts) {
509 assert(Loc.isFileID())((Loc.isFileID()) ? static_cast<void> (0) : __assert_fail
("Loc.isFileID()", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 509, __PRETTY_FUNCTION__))
;
510 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
511 if (LocInfo.first.isInvalid())
512 return Loc;
513
514 bool Invalid = false;
515 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
516 if (Invalid)
517 return Loc;
518
519 // Back up from the current location until we hit the beginning of a line
520 // (or the buffer). We'll relex from that point.
521 const char *StrData = Buffer.data() + LocInfo.second;
522 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
523 if (!LexStart || LexStart == StrData)
524 return Loc;
525
526 // Create a lexer starting at the beginning of this token.
527 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
528 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
529 Buffer.end());
530 TheLexer.SetCommentRetentionState(true);
531
532 // Lex tokens until we find the token that contains the source location.
533 Token TheTok;
534 do {
535 TheLexer.LexFromRawLexer(TheTok);
536
537 if (TheLexer.getBufferLocation() > StrData) {
538 // Lexing this token has taken the lexer past the source location we're
539 // looking for. If the current token encompasses our source location,
540 // return the beginning of that token.
541 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
542 return TheTok.getLocation();
543
544 // We ended up skipping over the source location entirely, which means
545 // that it points into whitespace. We're done here.
546 break;
547 }
548 } while (TheTok.getKind() != tok::eof);
549
550 // We've passed our source location; just return the original source location.
551 return Loc;
552}
553
554SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
555 const SourceManager &SM,
556 const LangOptions &LangOpts) {
557 if (Loc.isFileID())
558 return getBeginningOfFileToken(Loc, SM, LangOpts);
559
560 if (!SM.isMacroArgExpansion(Loc))
561 return Loc;
562
563 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
564 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
565 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
566 std::pair<FileID, unsigned> BeginFileLocInfo =
567 SM.getDecomposedLoc(BeginFileLoc);
568 assert(FileLocInfo.first == BeginFileLocInfo.first &&((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 569, __PRETTY_FUNCTION__))
569 FileLocInfo.second >= BeginFileLocInfo.second)((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 569, __PRETTY_FUNCTION__))
;
570 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
571}
572
namespace {

// Classification of a preprocessor directive encountered while computing a
// preamble in Lexer::ComputePreamble.
enum PreambleDirectiveKind {
  PDK_Skipped, // A directive that may safely appear in a preamble.
  PDK_Unknown  // An unrecognized directive; terminates the preamble scan.
};

} // namespace
581
/// Compute the bounds of the "preamble" of \p Buffer: the leading run of
/// comments and recognized preprocessor directives, optionally capped at
/// \p MaxLines lines.  The scan stops at the first ordinary token, at an
/// unrecognized directive, or at end of file.
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // If a line limit was requested, translate it into a byte offset by
  // counting newlines; MaxLineOffset stays 0 (no limit) if the buffer has
  // fewer than MaxLines lines.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is.  Since we're lexing raw tokens,
      // we don't have an identifier table available.  Instead, just look at
      // the raw identifier to recognize and categorize preprocessor
      // directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point.  Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  // The preamble ends at the last token we accepted — or at the first
  // comment of a trailing comment run, so a declaration's doc comment is
  // never truncated.
  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}
717
718unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
719 const SourceManager &SM,
720 const LangOptions &LangOpts) {
721 // Figure out how many physical characters away the specified expansion
722 // character is. This needs to take into consideration newlines and
723 // trigraphs.
724 bool Invalid = false;
725 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
726
727 // If they request the first char of the token, we're trivially done.
728 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
729 return 0;
730
731 unsigned PhysOffset = 0;
732
733 // The usual case is that tokens don't contain anything interesting. Skip
734 // over the uninteresting characters. If a token only consists of simple
735 // chars, this method is extremely fast.
736 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
737 if (CharNo == 0)
738 return PhysOffset;
739 ++TokPtr;
740 --CharNo;
741 ++PhysOffset;
742 }
743
744 // If we have a character that may be a trigraph or escaped newline, use a
745 // lexer to parse it correctly.
746 for (; CharNo; --CharNo) {
747 unsigned Size;
748 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
749 TokPtr += Size;
750 PhysOffset += Size;
751 }
752
753 // Final detail: if we end up on an escaped newline, we want to return the
754 // location of the actual byte of the token. For example foo\<newline>bar
755 // advanced by 3 should return the location of b, not of \\. One compounding
756 // detail of this is that the escape may be made by a trigraph.
757 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
758 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
759
760 return PhysOffset;
761}
762
763/// Computes the source location just past the end of the
764/// token at this source location.
765///
766/// This routine can be used to produce a source location that
767/// points just past the end of the token referenced by \p Loc, and
768/// is generally used when a diagnostic needs to point just after a
769/// token where it expected something different that it received. If
770/// the returned source location would not be meaningful (e.g., if
771/// it points into a macro), this routine returns an invalid
772/// source location.
773///
774/// \param Offset an offset from the end of the token, where the source
775/// location should refer to. The default offset (0) produces a source
776/// location pointing just past the end of the token; an offset of 1 produces
777/// a source location pointing to the last character in the token, etc.
778SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
779 const SourceManager &SM,
780 const LangOptions &LangOpts) {
781 if (Loc.isInvalid())
782 return {};
783
784 if (Loc.isMacroID()) {
785 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
786 return {}; // Points inside the macro expansion.
787 }
788
789 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
790 if (Len > Offset)
791 Len = Len - Offset;
792 else
793 return Loc;
794
795 return Loc.getLocWithOffset(Len);
796}
797
798/// Returns true if the given MacroID location points at the first
799/// token of the macro expansion.
800bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
801 const SourceManager &SM,
802 const LangOptions &LangOpts,
803 SourceLocation *MacroBegin) {
804 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 804, __PRETTY_FUNCTION__))
;
805
806 SourceLocation expansionLoc;
807 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
808 return false;
809
810 if (expansionLoc.isFileID()) {
811 // No other macro expansions, this is the first.
812 if (MacroBegin)
813 *MacroBegin = expansionLoc;
814 return true;
815 }
816
817 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
818}
819
820/// Returns true if the given MacroID location points at the last
821/// token of the macro expansion.
822bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
823 const SourceManager &SM,
824 const LangOptions &LangOpts,
825 SourceLocation *MacroEnd) {
826 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 826, __PRETTY_FUNCTION__))
;
827
828 SourceLocation spellLoc = SM.getSpellingLoc(loc);
829 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
830 if (tokLen == 0)
831 return false;
832
833 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
834 SourceLocation expansionLoc;
835 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
836 return false;
837
838 if (expansionLoc.isFileID()) {
839 // No other macro expansions.
840 if (MacroEnd)
841 *MacroEnd = expansionLoc;
842 return true;
843 }
844
845 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
846}
847
848static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
849 const SourceManager &SM,
850 const LangOptions &LangOpts) {
851 SourceLocation Begin = Range.getBegin();
852 SourceLocation End = Range.getEnd();
853 assert(Begin.isFileID() && End.isFileID())((Begin.isFileID() && End.isFileID()) ? static_cast<
void> (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 853, __PRETTY_FUNCTION__))
;
854 if (Range.isTokenRange()) {
855 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
856 if (End.isInvalid())
857 return {};
858 }
859
860 // Break down the source locations.
861 FileID FID;
862 unsigned BeginOffs;
863 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
864 if (FID.isInvalid())
865 return {};
866
867 unsigned EndOffs;
868 if (!SM.isInFileID(End, FID, &EndOffs) ||
869 BeginOffs > EndOffs)
870 return {};
871
872 return CharSourceRange::getCharRange(Begin, End);
873}
874
875CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
876 const SourceManager &SM,
877 const LangOptions &LangOpts) {
878 SourceLocation Begin = Range.getBegin();
879 SourceLocation End = Range.getEnd();
880 if (Begin.isInvalid() || End.isInvalid())
881 return {};
882
883 if (Begin.isFileID() && End.isFileID())
884 return makeRangeFromFileLocs(Range, SM, LangOpts);
885
886 if (Begin.isMacroID() && End.isFileID()) {
887 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
888 return {};
889 Range.setBegin(Begin);
890 return makeRangeFromFileLocs(Range, SM, LangOpts);
891 }
892
893 if (Begin.isFileID() && End.isMacroID()) {
894 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
895 &End)) ||
896 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
897 &End)))
898 return {};
899 Range.setEnd(End);
900 return makeRangeFromFileLocs(Range, SM, LangOpts);
901 }
902
903 assert(Begin.isMacroID() && End.isMacroID())((Begin.isMacroID() && End.isMacroID()) ? static_cast
<void> (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 903, __PRETTY_FUNCTION__))
;
904 SourceLocation MacroBegin, MacroEnd;
905 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
906 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
907 &MacroEnd)) ||
908 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
909 &MacroEnd)))) {
910 Range.setBegin(MacroBegin);
911 Range.setEnd(MacroEnd);
912 return makeRangeFromFileLocs(Range, SM, LangOpts);
913 }
914
915 bool Invalid = false;
916 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
917 &Invalid);
918 if (Invalid)
919 return {};
920
921 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
922 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
923 &Invalid);
924 if (Invalid)
925 return {};
926
927 if (EndEntry.getExpansion().isMacroArgExpansion() &&
928 BeginEntry.getExpansion().getExpansionLocStart() ==
929 EndEntry.getExpansion().getExpansionLocStart()) {
930 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
931 Range.setEnd(SM.getImmediateSpellingLoc(End));
932 return makeFileCharRange(Range, SM, LangOpts);
933 }
934 }
935
936 return {};
937}
938
939StringRef Lexer::getSourceText(CharSourceRange Range,
940 const SourceManager &SM,
941 const LangOptions &LangOpts,
942 bool *Invalid) {
943 Range = makeFileCharRange(Range, SM, LangOpts);
944 if (Range.isInvalid()) {
945 if (Invalid) *Invalid = true;
946 return {};
947 }
948
949 // Break down the source location.
950 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
951 if (beginInfo.first.isInvalid()) {
952 if (Invalid) *Invalid = true;
953 return {};
954 }
955
956 unsigned EndOffs;
957 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
958 beginInfo.second > EndOffs) {
959 if (Invalid) *Invalid = true;
960 return {};
961 }
962
963 // Try to the load the file buffer.
964 bool invalidTemp = false;
965 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
966 if (invalidTemp) {
967 if (Invalid) *Invalid = true;
968 return {};
969 }
970
971 if (Invalid) *Invalid = false;
972 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
973}
974
975StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
976 const SourceManager &SM,
977 const LangOptions &LangOpts) {
978 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 978, __PRETTY_FUNCTION__))
;
979
980 // Find the location of the immediate macro expansion.
981 while (true) {
982 FileID FID = SM.getFileID(Loc);
983 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
984 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
985 Loc = Expansion.getExpansionLocStart();
986 if (!Expansion.isMacroArgExpansion())
987 break;
988
989 // For macro arguments we need to check that the argument did not come
990 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
991
992 // Loc points to the argument id of the macro definition, move to the
993 // macro expansion.
994 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
995 SourceLocation SpellLoc = Expansion.getSpellingLoc();
996 if (SpellLoc.isFileID())
997 break; // No inner macro.
998
999 // If spelling location resides in the same FileID as macro expansion
1000 // location, it means there is no inner macro.
1001 FileID MacroFID = SM.getFileID(Loc);
1002 if (SM.isInFileID(SpellLoc, MacroFID))
1003 break;
1004
1005 // Argument came from inner macro.
1006 Loc = SpellLoc;
1007 }
1008
1009 // Find the spelling location of the start of the non-argument expansion
1010 // range. This is where the macro name was spelled in order to begin
1011 // expanding this macro.
1012 Loc = SM.getSpellingLoc(Loc);
1013
1014 // Dig out the buffer where the macro name was spelled and the extents of the
1015 // name so that we can render it into the expansion note.
1016 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1017 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1018 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1019 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1020}
1021
1022StringRef Lexer::getImmediateMacroNameForDiagnostics(
1023 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1024 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1024, __PRETTY_FUNCTION__))
;
1025 // Walk past macro argument expansions.
1026 while (SM.isMacroArgExpansion(Loc))
1027 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1028
1029 // If the macro's spelling has no FileID, then it's actually a token paste
1030 // or stringization (or similar) and not a macro at all.
1031 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1032 return {};
1033
1034 // Find the spelling location of the start of the non-argument expansion
1035 // range. This is where the macro name was spelled in order to begin
1036 // expanding this macro.
1037 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1038
1039 // Dig out the buffer where the macro name was spelled and the extents of the
1040 // name so that we can render it into the expansion note.
1041 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1042 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1043 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1044 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1045}
1046
1047bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1048 return isIdentifierBody(c, LangOpts.DollarIdents);
1049}
1050
1051bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1052 assert(isVerticalWhitespace(Str[0]))((isVerticalWhitespace(Str[0])) ? static_cast<void> (0)
: __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1052, __PRETTY_FUNCTION__))
;
1053 if (Str - 1 < BufferStart)
1054 return false;
1055
1056 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1057 (Str[0] == '\r' && Str[-1] == '\n')) {
1058 if (Str - 2 < BufferStart)
1059 return false;
1060 --Str;
1061 }
1062 --Str;
1063
1064 // Rewind to first non-space character:
1065 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1066 --Str;
1067
1068 return *Str == '\\';
1069}
1070
1071StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1072 const SourceManager &SM) {
1073 if (Loc.isInvalid() || Loc.isMacroID())
1074 return {};
1075 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1076 if (LocInfo.first.isInvalid())
1077 return {};
1078 bool Invalid = false;
1079 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1080 if (Invalid)
1081 return {};
1082 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1083 if (!Line)
1084 return {};
1085 StringRef Rest = Buffer.substr(Line - Buffer.data());
1086 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1087 return NumWhitespaceChars == StringRef::npos
1088 ? ""
1089 : Rest.take_front(NumWhitespaceChars);
1090}
1091
1092//===----------------------------------------------------------------------===//
1093// Diagnostics forwarding code.
1094//===----------------------------------------------------------------------===//
1095
1096/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1097/// lexer buffer was all expanded at a single point, perform the mapping.
1098/// This is currently only used for _Pragma implementation, so it is the slow
1099/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1100static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1101 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1102static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1103 SourceLocation FileLoc,
1104 unsigned CharNo, unsigned TokLen) {
1105 assert(FileLoc.isMacroID() && "Must be a macro expansion")((FileLoc.isMacroID() && "Must be a macro expansion")
? static_cast<void> (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1105, __PRETTY_FUNCTION__))
;
1106
1107 // Otherwise, we're lexing "mapped tokens". This is used for things like
1108 // _Pragma handling. Combine the expansion location of FileLoc with the
1109 // spelling location.
1110 SourceManager &SM = PP.getSourceManager();
1111
1112 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1113 // characters come from spelling(FileLoc)+Offset.
1114 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1115 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1116
1117 // Figure out the expansion loc range, which is the range covered by the
1118 // original _Pragma(...) sequence.
1119 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1120
1121 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1122}
1123
1124/// getSourceLocation - Return a source location identifier for the specified
1125/// offset in the current file.
1126SourceLocation Lexer::getSourceLocation(const char *Loc,
1127 unsigned TokLen) const {
1128 assert(Loc >= BufferStart && Loc <= BufferEnd &&((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1129, __PRETTY_FUNCTION__))
1129 "Location out of range for this buffer!")((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1129, __PRETTY_FUNCTION__))
;
1130
1131 // In the normal case, we're just lexing from a simple file buffer, return
1132 // the file id from FileLoc with the offset specified.
1133 unsigned CharNo = Loc-BufferStart;
1134 if (FileLoc.isFileID())
1135 return FileLoc.getLocWithOffset(CharNo);
1136
1137 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1138 // tokens are lexed from where the _Pragma was defined.
1139 assert(PP && "This doesn't work on raw lexers")((PP && "This doesn't work on raw lexers") ? static_cast
<void> (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1139, __PRETTY_FUNCTION__))
;
1140 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1141}
1142
1143/// Diag - Forwarding function for diagnostics. This translate a source
1144/// position in the current buffer into a SourceLocation object for rendering.
1145DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1146 return PP->Diag(getSourceLocation(Loc), DiagID);
1147}
1148
1149//===----------------------------------------------------------------------===//
1150// Trigraph and Escaped Newline Handling Code.
1151//===----------------------------------------------------------------------===//
1152
/// GetTrigraphCharForLetter - Given the character that occurs after a ??
/// pair, return the decoded trigraph letter it corresponds to, or '\0' if
/// the sequence is not a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  default:   return 0;
  }
}
1169
1170/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1171/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1172/// return the result character. Finally, emit a warning about trigraph use
1173/// whether trigraphs are enabled or not.
1174static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1175 char Res = GetTrigraphCharForLetter(*CP);
1176 if (!Res || !L) return Res;
1177
1178 if (!L->getLangOpts().Trigraphs) {
1179 if (!L->isLexingRawMode())
1180 L->Diag(CP-2, diag::trigraph_ignored);
1181 return 0;
1182 }
1183
1184 if (!L->isLexingRawMode())
1185 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1186 return Res;
1187}
1188
1189/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1190/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1191/// trigraph equivalent on entry to this function.
1192unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1193 unsigned Size = 0;
1194 while (isWhitespace(Ptr[Size])) {
1195 ++Size;
1196
1197 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1198 continue;
1199
1200 // If this is a \r\n or \n\r, skip the other half.
1201 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1202 Ptr[Size-1] != Ptr[Size])
1203 ++Size;
1204
1205 return Size;
1206 }
1207
1208 // Not an escaped newline, must be a \t or something else.
1209 return 0;
1210}
1211
1212/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1213/// them), skip over them and return the first non-escaped-newline found,
1214/// otherwise return P.
1215const char *Lexer::SkipEscapedNewLines(const char *P) {
1216 while (true) {
1217 const char *AfterEscape;
1218 if (*P == '\\') {
1219 AfterEscape = P+1;
1220 } else if (*P == '?') {
1221 // If not a trigraph for escape, bail out.
1222 if (P[1] != '?' || P[2] != '/')
1223 return P;
1224 // FIXME: Take LangOpts into account; the language might not
1225 // support trigraphs.
1226 AfterEscape = P+3;
1227 } else {
1228 return P;
1229 }
1230
1231 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1232 if (NewLineSize == 0) return P;
1233 P = AfterEscape+NewLineSize;
1234 }
1235}
1236
1237Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1238 const SourceManager &SM,
1239 const LangOptions &LangOpts) {
1240 if (Loc.isMacroID()) {
1241 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1242 return None;
1243 }
1244 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1245
1246 // Break down the source location.
1247 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1248
1249 // Try to load the file buffer.
1250 bool InvalidTemp = false;
1251 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1252 if (InvalidTemp)
1253 return None;
1254
1255 const char *TokenBegin = File.data() + LocInfo.second;
1256
1257 // Lex from the start of the given location.
1258 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1259 TokenBegin, File.end());
1260 // Find the token.
1261 Token Tok;
1262 lexer.LexFromRawLexer(Tok);
1263 return Tok;
1264}
1265
1266/// Checks that the given token is the first token that occurs after the
1267/// given location (this excludes comments and whitespace). Returns the location
1268/// immediately after the specified token. If the token is not found or the
1269/// location is inside a macro, the returned source location will be invalid.
1270SourceLocation Lexer::findLocationAfterToken(
1271 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1272 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1273 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1274 if (!Tok || Tok->isNot(TKind))
1275 return {};
1276 SourceLocation TokenLoc = Tok->getLocation();
1277
1278 // Calculate how much whitespace needs to be skipped if any.
1279 unsigned NumWhitespaceChars = 0;
1280 if (SkipTrailingWhitespaceAndNewLine) {
1281 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1282 unsigned char C = *TokenEnd;
1283 while (isHorizontalWhitespace(C)) {
1284 C = *(++TokenEnd);
1285 NumWhitespaceChars++;
1286 }
1287
1288 // Skip \r, \n, \r\n, or \n\r
1289 if (C == '\n' || C == '\r') {
1290 char PrevC = C;
1291 C = *(++TokenEnd);
1292 NumWhitespaceChars++;
1293 if ((C == '\n' || C == '\r') && C != PrevC)
1294 NumWhitespaceChars++;
1295 }
1296 }
1297
1298 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1299}
1300
1301/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1302/// get its size, and return it. This is tricky in several cases:
1303/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1304/// then either return the trigraph (skipping 3 chars) or the '?',
1305/// depending on whether trigraphs are enabled or not.
1306/// 2. If this is an escaped newline (potentially with whitespace between
1307/// the backslash and newline), implicitly skip the newline and return
1308/// the char after it.
1309///
1310/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1311/// know that we can accumulate into Size, and that we have already incremented
1312/// Ptr by Size bytes.
1313///
1314/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1315/// be updated to match.
1316char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1317 Token *Tok) {
1318 // If we have a slash, look for an escaped newline.
1319 if (Ptr[0] == '\\') {
8
Taking true branch
1320 ++Size;
1321 ++Ptr;
1322Slash:
1323 // Common case, backslash-char where the char is not whitespace.
1324 if (!isWhitespace(Ptr[0])) return '\\';
9
Assuming the condition is false
10
Taking false branch
1325
1326 // See if we have optional whitespace characters between the slash and
1327 // newline.
1328 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
11
Assuming 'EscapedNewLineSize' is not equal to 0
12
Taking true branch
1329 // Remember that this token needs to be cleaned.
1330 if (Tok
12.1
'Tok' is non-null
12.1
'Tok' is non-null
12.1
'Tok' is non-null
) Tok->setFlag(Token::NeedsCleaning);
13
Taking true branch
14
Calling 'Token::setFlag'
1331
1332 // Warn if there was whitespace between the backslash and newline.
1333 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1334 Diag(Ptr, diag::backslash_newline_space);
1335
1336 // Found backslash<whitespace><newline>. Parse the char after it.
1337 Size += EscapedNewLineSize;
1338 Ptr += EscapedNewLineSize;
1339
1340 // Use slow version to accumulate a correct size field.
1341 return getCharAndSizeSlow(Ptr, Size, Tok);
1342 }
1343
1344 // Otherwise, this is not an escaped newline, just return the slash.
1345 return '\\';
1346 }
1347
1348 // If this is a trigraph, process it.
1349 if (Ptr[0] == '?' && Ptr[1] == '?') {
1350 // If this is actually a legal trigraph (not something like "??x"), emit
1351 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1352 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1353 // Remember that this token needs to be cleaned.
1354 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1355
1356 Ptr += 3;
1357 Size += 3;
1358 if (C == '\\') goto Slash;
1359 return C;
1360 }
1361 }
1362
1363 // If this is neither, return a single character.
1364 ++Size;
1365 return *Ptr;
1366}
1367
1368/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1369/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1370/// and that we have already incremented Ptr by Size bytes.
1371///
1372/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1373/// be updated to match.
1374char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1375 const LangOptions &LangOpts) {
1376 // If we have a slash, look for an escaped newline.
1377 if (Ptr[0] == '\\') {
1378 ++Size;
1379 ++Ptr;
1380Slash:
1381 // Common case, backslash-char where the char is not whitespace.
1382 if (!isWhitespace(Ptr[0])) return '\\';
1383
1384 // See if we have optional whitespace characters followed by a newline.
1385 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1386 // Found backslash<whitespace><newline>. Parse the char after it.
1387 Size += EscapedNewLineSize;
1388 Ptr += EscapedNewLineSize;
1389
1390 // Use slow version to accumulate a correct size field.
1391 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1392 }
1393
1394 // Otherwise, this is not an escaped newline, just return the slash.
1395 return '\\';
1396 }
1397
1398 // If this is a trigraph, process it.
1399 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1400 // If this is actually a legal trigraph (not something like "??x"), return
1401 // it.
1402 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1403 Ptr += 3;
1404 Size += 3;
1405 if (C == '\\') goto Slash;
1406 return C;
1407 }
1408 }
1409
1410 // If this is neither, return a single character.
1411 ++Size;
1412 return *Ptr;
1413}
1414
1415//===----------------------------------------------------------------------===//
1416// Helper methods for lexing.
1417//===----------------------------------------------------------------------===//
1418
1419/// Routine that indiscriminately sets the offset into the source file.
1420void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1421 BufferPtr = BufferStart + Offset;
1422 if (BufferPtr > BufferEnd)
1423 BufferPtr = BufferEnd;
1424 // FIXME: What exactly does the StartOfLine bit mean? There are two
1425 // possible meanings for the "start" of the line: the first token on the
1426 // unexpanded line, or the first token on the expanded line.
1427 IsAtStartOfLine = StartOfLine;
1428 IsAtPhysicalStartOfLine = StartOfLine;
1429}
1430
1431static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1432 if (LangOpts.AsmPreprocessor) {
1433 return false;
1434 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1435 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1436 C11AllowedIDCharRanges);
1437 return C11AllowedIDChars.contains(C);
1438 } else if (LangOpts.CPlusPlus) {
1439 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1440 CXX03AllowedIDCharRanges);
1441 return CXX03AllowedIDChars.contains(C);
1442 } else {
1443 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1444 C99AllowedIDCharRanges);
1445 return C99AllowedIDChars.contains(C);
1446 }
1447}
1448
1449static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1450 assert(isAllowedIDChar(C, LangOpts))((isAllowedIDChar(C, LangOpts)) ? static_cast<void> (0)
: __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 1450, __PRETTY_FUNCTION__))
;
1451 if (LangOpts.AsmPreprocessor) {
1452 return false;
1453 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1454 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1455 C11DisallowedInitialIDCharRanges);
1456 return !C11DisallowedInitialIDChars.contains(C);
1457 } else if (LangOpts.CPlusPlus) {
1458 return true;
1459 } else {
1460 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1461 C99DisallowedInitialIDCharRanges);
1462 return !C99DisallowedInitialIDChars.contains(C);
1463 }
1464}
1465
1466static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1467 const char *End) {
1468 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1469 L.getSourceLocation(End));
1470}
1471
1472static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1473 CharSourceRange Range, bool IsFirst) {
1474 // Check C99 compatibility.
1475 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1476 enum {
1477 CannotAppearInIdentifier = 0,
1478 CannotStartIdentifier
1479 };
1480
1481 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1482 C99AllowedIDCharRanges);
1483 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1484 C99DisallowedInitialIDCharRanges);
1485 if (!C99AllowedIDChars.contains(C)) {
1486 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1487 << Range
1488 << CannotAppearInIdentifier;
1489 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1490 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1491 << Range
1492 << CannotStartIdentifier;
1493 }
1494 }
1495
1496 // Check C++98 compatibility.
1497 if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1498 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1499 CXX03AllowedIDCharRanges);
1500 if (!CXX03AllowedIDChars.contains(C)) {
1501 Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1502 << Range;
1503 }
1504 }
1505}
1506
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
///
/// A pair with LooksLike == 0 denotes an invisible (zero-width) character
/// rather than a lookalike of a visible ASCII character; the two cases get
/// different diagnostics below.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by code point so we can binary-search with std::lower_bound.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}            // Sentinel; excluded from the search below.
  };
  // Binary-search the table, excluding the trailing {0, 0} sentinel.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Format the code point as an uppercase 4-digit hex string for the
    // diagnostic text.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      // Visible lookalike: report which ASCII character it resembles.
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      // Zero-width / invisible character.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
1590
/// Try to consume a universal-character-name (starting at CurPtr + Size,
/// i.e. just past the backslash) that names a valid identifier character.
/// On success, advances CurPtr past the UCN, marks Result with
/// Token::HasUCN, and returns true; on failure leaves CurPtr unchanged and
/// returns false.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  // A return of 0 from tryReadUCN indicates no valid UCN was read here.
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  // NOTE(review): Result is assumed to have been started (flags initialized)
  // by the caller before setFlag is invoked — confirm against callers.
  Result.setFlag(Token::HasUCN);
  // If the UCN spans exactly its canonical spelling (6 chars for \uXXXX,
  // 10 for \UXXXXXXXX), skip it wholesale; otherwise re-lex it character by
  // character via getAndAdvanceChar so escaped newlines/trigraphs inside the
  // spelling update Result's flags correctly.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1612
/// Try to consume a raw UTF-8 encoded code point at CurPtr that is a valid
/// identifier character.  On success advances CurPtr past the sequence and
/// returns true; on failure (bad UTF-8 or not an identifier character)
/// leaves CurPtr unchanged and returns false.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  // Strict conversion rejects overlong/invalid encodings; UnicodePtr is
  // advanced past the sequence on success.
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    // Warn about pre-C99/C++11 incompatibility and about characters that
    // look like punctuation or are invisible.
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}
1636
/// Lex the remainder of an identifier whose first character has already been
/// consumed.  Forms a raw_identifier (or code_completion) token in Result
/// and, outside raw mode, resolves it through the preprocessor's identifier
/// table and macro handling.
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we
      // need to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      // Consumed a \uXXXX / \UXXXXXXXX identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      // Consumed a raw UTF-8 identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      // Found the end of the identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume a maximal run of plain identifier characters before re-checking
    // the special cases above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
1735
1736/// isHexaLiteral - Return true if Start points to a hex constant.
1737/// in microsoft mode (where this is supposed to be several different tokens).
1738bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1739 unsigned Size;
1740 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1741 if (C1 != '0')
1742 return false;
1743 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1744 return (C2 == 'x' || C2 == 'X');
1745}
1746
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.  Recurses to continue the constant after exponent signs,
/// hex-float exponents, digit separators, and UCN/UTF-8 suffix characters.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the maximal run of pp-number body characters, remembering the
  // previous character so we can recognize e+/e- and p+/p- below.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        // A '_' so far means this is a ud-suffix, not a hexfloat exponent.
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    // Only a separator if an identifier character follows; otherwise the '
    // starts a character literal.
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
1810
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.  Returns the pointer past the
/// consumed suffix (CurPtr unchanged if there is no suffix).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    // The suffix may also begin with a UCN or a raw UTF-8 character.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;  // No ud-suffix here.
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11: a ud-suffix is just a compatibility warning, not a token
    // component.  Suggest inserting a space to separate the tokens.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;  // Shadows the outer flag; tracks bytes read.
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      // Not a valid ud-suffix: warn and treat it as a separate token.
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  // Consume the rest of the suffix identifier.
  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}
1901
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".  Forms the token in Result (tok::unknown
/// on an unterminated literal) and always returns true.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode string literals are a compatibility warning in older modes.
  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated string: diagnose and return an unknown token covering
      // what we consumed.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be a code-completion point rather than a real character.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
1963
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".  Matches the d-char delimiter,
/// scans to the matching )delim", and forms the token in Result.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  // Lex the d-char-sequence (at most 16 characters per the standard).
  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the terminating )prefix" sequence.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2043
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
/// If the literal is unterminated, the '<' is returned as a plain tok::less.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      // NUL may mark a code-completion point inside the filename.
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}
2087
/// Set up preprocessor state for code completion inside a #include filename.
/// PathStart points at the first character after the opening quote/angle,
/// CompletionPoint at the completion marker, and IsAngled selects <...> vs
/// "..." termination.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC-compatible mode also treats '\' as a path separator.
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    // Stop at end-of-line/end-of-buffer without consuming it.
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    // Include the closing quote/angle in the replacement range, then stop.
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2115
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.  Forms the token in Result
/// (tok::unknown for empty or unterminated constants) and always returns true.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode character constants are a compatibility warning in older modes.
  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated character constant.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // NUL may mark a code-completion point rather than a real character.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2180
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Otherwise it records leading-space / start-of-line state on Result and in
/// TokAtPhysicalStartOfLine, and returns false so lexing continues.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  // CurPtr[-1] is the whitespace character that got us here.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}
2237
2238/// We have just read the // characters from input. Skip until we find the
2239/// newline character that terminates the comment. Then update BufferPtr and
2240/// return.
2241///
2242/// If we're in KeepCommentMode or any CommentHandler has inserted
2243/// some tokens, this will store the first token and return true.
2244bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2245 bool &TokAtPhysicalStartOfLine) {
2246 // If Line comments aren't explicitly enabled for this language, emit an
2247 // extension warning.
2248 if (!LangOpts.LineComment && !isLexingRawMode()) {
2249 Diag(BufferPtr, diag::ext_line_comment);
2250
2251 // Mark them enabled so we only emit one warning for this translation
2252 // unit.
2253 LangOpts.LineComment = true;
2254 }
2255
2256 // Scan over the body of the comment. The common case, when scanning, is that
2257 // the comment contains normal ascii characters with nothing interesting in
2258 // them. As such, optimize for this case with the inner loop.
2259 //
2260 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2261 // character that ends the line comment.
2262 char C;
2263 while (true) {
2264 C = *CurPtr;
2265 // Skip over characters in the fast loop.
2266 while (C != 0 && // Potentially EOF.
2267 C != '\n' && C != '\r') // Newline or DOS-style newline.
2268 C = *++CurPtr;
2269
2270 const char *NextLine = CurPtr;
2271 if (C != 0) {
2272 // We found a newline, see if it's escaped.
2273 const char *EscapePtr = CurPtr-1;
2274 bool HasSpace = false;
2275 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2276 --EscapePtr;
2277 HasSpace = true;
2278 }
2279
2280 if (*EscapePtr == '\\')
2281 // Escaped newline.
2282 CurPtr = EscapePtr;
2283 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2284 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2285 // Trigraph-escaped newline.
2286 CurPtr = EscapePtr-2;
2287 else
2288 break; // This is a newline, we're done.
2289
2290 // If there was space between the backslash and newline, warn about it.
2291 if (HasSpace && !isLexingRawMode())
2292 Diag(EscapePtr, diag::backslash_newline_space);
2293 }
2294
2295 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2296 // properly decode the character. Read it in raw mode to avoid emitting
2297 // diagnostics about things like trigraphs. If we see an escaped newline,
2298 // we'll handle it below.
2299 const char *OldPtr = CurPtr;
2300 bool OldRawMode = isLexingRawMode();
2301 LexingRawMode = true;
2302 C = getAndAdvanceChar(CurPtr, Result);
2303 LexingRawMode = OldRawMode;
2304
2305 // If we only read only one character, then no special handling is needed.
2306 // We're done and can skip forward to the newline.
2307 if (C != 0 && CurPtr == OldPtr+1) {
2308 CurPtr = NextLine;
2309 break;
2310 }
2311
2312 // If we read multiple characters, and one of those characters was a \r or
2313 // \n, then we had an escaped newline within the comment. Emit diagnostic
2314 // unless the next line is also a // comment.
2315 if (CurPtr != OldPtr + 1 && C != '/' &&
2316 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2317 for (; OldPtr != CurPtr; ++OldPtr)
2318 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2319 // Okay, we found a // comment that ends in a newline, if the next
2320 // line is also a // comment, but has spaces, don't emit a diagnostic.
2321 if (isWhitespace(C)) {
2322 const char *ForwardPtr = CurPtr;
2323 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2324 ++ForwardPtr;
2325 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2326 break;
2327 }
2328
2329 if (!isLexingRawMode())
2330 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2331 break;
2332 }
2333 }
2334
2335 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2336 --CurPtr;
2337 break;
2338 }
2339
2340 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2341 PP->CodeCompleteNaturalLanguage();
2342 cutOffLexing();
2343 return false;
2344 }
2345 }
2346
2347 // Found but did not consume the newline. Notify comment handlers about the
2348 // comment unless we're in a #if 0 block.
2349 if (PP && !isLexingRawMode() &&
2350 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2351 getSourceLocation(CurPtr)))) {
2352 BufferPtr = CurPtr;
2353 return true; // A token has to be returned.
2354 }
2355
2356 // If we are returning comments as tokens, return this comment as a token.
2357 if (inKeepCommentMode())
2358 return SaveLineComment(Result, CurPtr);
2359
2360 // If we are inside a preprocessor directive and we see the end of line,
2361 // return immediately, so that the lexer can return this as an EOD token.
2362 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2363 BufferPtr = CurPtr;
2364 return false;
2365 }
2366
2367 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2368 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2369 // contribute to another token), it isn't needed for correctness. Note that
2370 // this is ok even in KeepWhitespaceMode, because we would have returned the
2371 /// comment above in that mode.
2372 ++CurPtr;
2373
2374 // The next returned token is at the start of the line.
2375 Result.setFlag(Token::StartOfLine);
2376 TokAtPhysicalStartOfLine = true;
2377 // No leading whitespace seen so far.
2378 Result.clearFlag(Token::LeadingSpace);
2379 BufferPtr = CurPtr;
2380 return false;
2381}
2382
2383/// If in save-comment mode, package up this Line comment in an appropriate
2384/// way and return it.
2385bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2386 // If we're not in a preprocessor directive, just return the // comment
2387 // directly.
2388 FormTokenWithChars(Result, CurPtr, tok::comment);
2389
2390 if (!ParsingPreprocessorDirective || LexingRawMode)
2391 return true;
2392
2393 // If this Line-style comment is in a macro definition, transmogrify it into
2394 // a C-style block comment.
2395 bool Invalid = false;
2396 std::string Spelling = PP->getSpelling(Result, &Invalid);
2397 if (Invalid)
2398 return true;
2399
2400 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")((Spelling[0] == '/' && Spelling[1] == '/' &&
"Not line comment?") ? static_cast<void> (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2400, __PRETTY_FUNCTION__))
;
2401 Spelling[1] = '*'; // Change prefix to "/*".
2402 Spelling += "*/"; // add suffix.
2403
2404 Result.setKind(tok::comment);
2405 PP->CreateString(Spelling, Result,
2406 Result.getLocation(), Result.getLocation());
2407 return true;
2408}
2409
2410/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2411/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2412/// a diagnostic if so. We know that the newline is inside of a block comment.
2413static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2414 Lexer *L) {
2415 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')((CurPtr[0] == '\n' || CurPtr[0] == '\r') ? static_cast<void
> (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2415, __PRETTY_FUNCTION__))
;
2416
2417 // Back up off the newline.
2418 --CurPtr;
2419
2420 // If this is a two-character newline sequence, skip the other character.
2421 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2422 // \n\n or \r\r -> not escaped newline.
2423 if (CurPtr[0] == CurPtr[1])
2424 return false;
2425 // \n\r or \r\n -> skip the newline.
2426 --CurPtr;
2427 }
2428
2429 // If we have horizontal whitespace, skip over it. We allow whitespace
2430 // between the slash and newline.
2431 bool HasSpace = false;
2432 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2433 --CurPtr;
2434 HasSpace = true;
2435 }
2436
2437 // If we have a slash, we know this is an escaped newline.
2438 if (*CurPtr == '\\') {
2439 if (CurPtr[-1] != '*') return false;
2440 } else {
2441 // It isn't a slash, is it the ?? / trigraph?
2442 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2443 CurPtr[-3] != '*')
2444 return false;
2445
2446 // This is the trigraph ending the comment. Emit a stern warning!
2447 CurPtr -= 2;
2448
2449 // If no trigraphs are enabled, warn that we ignored this trigraph and
2450 // ignore this * character.
2451 if (!L->getLangOpts().Trigraphs) {
2452 if (!L->isLexingRawMode())
2453 L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2454 return false;
2455 }
2456 if (!L->isLexingRawMode())
2457 L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2458 }
2459
2460 // Warn about having an escaped newline between the */ characters.
2461 if (!L->isLexingRawMode())
2462 L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2463
2464 // If there was space between the backslash and newline, warn about it.
2465 if (HasSpace && !L->isLexingRawMode())
2466 L->Diag(CurPtr, diag::backslash_newline_space);
2467
2468 return true;
2469}
2470
// Pull in the SIMD intrinsics used by the fast block-comment scanner below.
// NOTE(fix): the guard macro was corrupted to "__SSE2__1", which is never
// defined, so the SSE2 fast path could not be selected; the correct
// compiler-defined macro is __SSE2__.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif
2477
2478/// We have just read from input the / and * characters that started a comment.
2479/// Read until we find the * and / characters that terminate the comment.
2480/// Note that we don't bother decoding trigraphs or escaped newlines in block
2481/// comments, because they cannot cause the comment to end. The only thing
2482/// that can happen is the comment could end with an escaped newline between
2483/// the terminating * and /.
2484///
2485/// If we're in KeepCommentMode or any CommentHandler has inserted
2486/// some tokens, this will store the first token and return true.
2487bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2488 bool &TokAtPhysicalStartOfLine) {
2489 // Scan one character past where we should, looking for a '/' character. Once
2490 // we find it, check to see if it was preceded by a *. This common
2491 // optimization helps people who like to put a lot of * characters in their
2492 // comments.
2493
2494 // The first character we get with newlines and trigraphs skipped to handle
2495 // the degenerate /*/ case below correctly if the * has an escaped newline
2496 // after it.
2497 unsigned CharSize;
2498 unsigned char C = getCharAndSize(CurPtr, CharSize);
2499 CurPtr += CharSize;
2500 if (C == 0 && CurPtr == BufferEnd+1) {
2501 if (!isLexingRawMode())
2502 Diag(BufferPtr, diag::err_unterminated_block_comment);
2503 --CurPtr;
2504
2505 // KeepWhitespaceMode should return this broken comment as a token. Since
2506 // it isn't a well formed comment, just return it as an 'unknown' token.
2507 if (isKeepWhitespaceMode()) {
2508 FormTokenWithChars(Result, CurPtr, tok::unknown);
2509 return true;
2510 }
2511
2512 BufferPtr = CurPtr;
2513 return false;
2514 }
2515
2516 // Check to see if the first character after the '/*' is another /. If so,
2517 // then this slash does not end the block comment, it is part of it.
2518 if (C == '/')
2519 C = *CurPtr++;
2520
2521 while (true) {
2522 // Skip over all non-interesting characters until we find end of buffer or a
2523 // (probably ending) '/' character.
2524 if (CurPtr + 24 < BufferEnd &&
2525 // If there is a code-completion point avoid the fast scan because it
2526 // doesn't check for '\0'.
2527 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2528 // While not aligned to a 16-byte boundary.
2529 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2530 C = *CurPtr++;
2531
2532 if (C == '/') goto FoundSlash;
2533
2534#ifdef __SSE2__1
2535 __m128i Slashes = _mm_set1_epi8('/');
2536 while (CurPtr+16 <= BufferEnd) {
2537 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2538 Slashes));
2539 if (cmp != 0) {
2540 // Adjust the pointer to point directly after the first slash. It's
2541 // not necessary to set C here, it will be overwritten at the end of
2542 // the outer loop.
2543 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2544 goto FoundSlash;
2545 }
2546 CurPtr += 16;
2547 }
2548#elif __ALTIVEC__
2549 __vector unsigned char Slashes = {
2550 '/', '/', '/', '/', '/', '/', '/', '/',
2551 '/', '/', '/', '/', '/', '/', '/', '/'
2552 };
2553 while (CurPtr+16 <= BufferEnd &&
2554 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2555 CurPtr += 16;
2556#else
2557 // Scan for '/' quickly. Many block comments are very large.
2558 while (CurPtr[0] != '/' &&
2559 CurPtr[1] != '/' &&
2560 CurPtr[2] != '/' &&
2561 CurPtr[3] != '/' &&
2562 CurPtr+4 < BufferEnd) {
2563 CurPtr += 4;
2564 }
2565#endif
2566
2567 // It has to be one of the bytes scanned, increment to it and read one.
2568 C = *CurPtr++;
2569 }
2570
2571 // Loop to scan the remainder.
2572 while (C != '/' && C != '\0')
2573 C = *CurPtr++;
2574
2575 if (C == '/') {
2576 FoundSlash:
2577 if (CurPtr[-2] == '*') // We found the final */. We're done!
2578 break;
2579
2580 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2581 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2582 // We found the final */, though it had an escaped newline between the
2583 // * and /. We're done!
2584 break;
2585 }
2586 }
2587 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2588 // If this is a /* inside of the comment, emit a warning. Don't do this
2589 // if this is a /*/, which will end the comment. This misses cases with
2590 // embedded escaped newlines, but oh well.
2591 if (!isLexingRawMode())
2592 Diag(CurPtr-1, diag::warn_nested_block_comment);
2593 }
2594 } else if (C == 0 && CurPtr == BufferEnd+1) {
2595 if (!isLexingRawMode())
2596 Diag(BufferPtr, diag::err_unterminated_block_comment);
2597 // Note: the user probably forgot a */. We could continue immediately
2598 // after the /*, but this would involve lexing a lot of what really is the
2599 // comment, which surely would confuse the parser.
2600 --CurPtr;
2601
2602 // KeepWhitespaceMode should return this broken comment as a token. Since
2603 // it isn't a well formed comment, just return it as an 'unknown' token.
2604 if (isKeepWhitespaceMode()) {
2605 FormTokenWithChars(Result, CurPtr, tok::unknown);
2606 return true;
2607 }
2608
2609 BufferPtr = CurPtr;
2610 return false;
2611 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2612 PP->CodeCompleteNaturalLanguage();
2613 cutOffLexing();
2614 return false;
2615 }
2616
2617 C = *CurPtr++;
2618 }
2619
2620 // Notify comment handlers about the comment unless we're in a #if 0 block.
2621 if (PP && !isLexingRawMode() &&
2622 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2623 getSourceLocation(CurPtr)))) {
2624 BufferPtr = CurPtr;
2625 return true; // A token has to be returned.
2626 }
2627
2628 // If we are returning comments as tokens, return this comment as a token.
2629 if (inKeepCommentMode()) {
2630 FormTokenWithChars(Result, CurPtr, tok::comment);
2631 return true;
2632 }
2633
2634 // It is common for the tokens immediately after a /**/ comment to be
2635 // whitespace. Instead of going through the big switch, handle it
2636 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2637 // have already returned above with the comment as a token.
2638 if (isHorizontalWhitespace(*CurPtr)) {
2639 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2640 return false;
2641 }
2642
2643 // Otherwise, just return so that the next character will be lexed as a token.
2644 BufferPtr = CurPtr;
2645 Result.setFlag(Token::LeadingSpace);
2646 return false;
2647}
2648
2649//===----------------------------------------------------------------------===//
2650// Primary Lexing Entry Points
2651//===----------------------------------------------------------------------===//
2652
2653/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2654/// uninterpreted string. This switches the lexer out of directive mode.
2655void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2656 assert(ParsingPreprocessorDirective && ParsingFilename == false &&((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2657, __PRETTY_FUNCTION__))
1
Assuming field 'ParsingPreprocessorDirective' is true
2
Assuming the condition is true
3
'?' condition is true
2657 "Must be in a preprocessing directive!")((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2657, __PRETTY_FUNCTION__))
;
2658 Token Tmp;
2659
2660 // CurPtr - Cache BufferPtr in an automatic variable.
2661 const char *CurPtr = BufferPtr;
2662 while (true) {
4
Loop condition is true. Entering loop body
2663 char Char = getAndAdvanceChar(CurPtr, Tmp);
5
Calling 'Lexer::getAndAdvanceChar'
2664 switch (Char) {
2665 default:
2666 if (Result)
2667 Result->push_back(Char);
2668 break;
2669 case 0: // Null.
2670 // Found end of file?
2671 if (CurPtr-1 != BufferEnd) {
2672 if (isCodeCompletionPoint(CurPtr-1)) {
2673 PP->CodeCompleteNaturalLanguage();
2674 cutOffLexing();
2675 return;
2676 }
2677
2678 // Nope, normal character, continue.
2679 if (Result)
2680 Result->push_back(Char);
2681 break;
2682 }
2683 // FALL THROUGH.
2684 LLVM_FALLTHROUGH[[gnu::fallthrough]];
2685 case '\r':
2686 case '\n':
2687 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2688 assert(CurPtr[-1] == Char && "Trigraphs for newline?")((CurPtr[-1] == Char && "Trigraphs for newline?") ? static_cast
<void> (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2688, __PRETTY_FUNCTION__))
;
2689 BufferPtr = CurPtr-1;
2690
2691 // Next, lex the character, which should handle the EOD transition.
2692 Lex(Tmp);
2693 if (Tmp.is(tok::code_completion)) {
2694 if (PP)
2695 PP->CodeCompleteNaturalLanguage();
2696 Lex(Tmp);
2697 }
2698 assert(Tmp.is(tok::eod) && "Unexpected token!")((Tmp.is(tok::eod) && "Unexpected token!") ? static_cast
<void> (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2698, __PRETTY_FUNCTION__))
;
2699
2700 // Finally, we're done;
2701 return;
2702 }
2703 }
2704}
2705
2706/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2707/// condition, reporting diagnostics and handling other edge cases as required.
2708/// This returns true if Result contains a token, false if PP.Lex should be
2709/// called again.
2710bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2711 // If we hit the end of the file while parsing a preprocessor directive,
2712 // end the preprocessor directive first. The next token returned will
2713 // then be the end of file.
2714 if (ParsingPreprocessorDirective) {
2715 // Done parsing the "line".
2716 ParsingPreprocessorDirective = false;
2717 // Update the location of token as well as BufferPtr.
2718 FormTokenWithChars(Result, CurPtr, tok::eod);
2719
2720 // Restore comment saving mode, in case it was disabled for directive.
2721 if (PP)
2722 resetExtendedTokenMode();
2723 return true; // Have a token.
2724 }
2725
2726 // If we are in raw mode, return this event as an EOF token. Let the caller
2727 // that put us in raw mode handle the event.
2728 if (isLexingRawMode()) {
2729 Result.startToken();
2730 BufferPtr = BufferEnd;
2731 FormTokenWithChars(Result, BufferEnd, tok::eof);
2732 return true;
2733 }
2734
2735 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2736 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2737 ConditionalStack.clear();
2738 }
2739
2740 // Issue diagnostics for unterminated #if and missing newline.
2741
2742 // If we are in a #if directive, emit an error.
2743 while (!ConditionalStack.empty()) {
2744 if (PP->getCodeCompletionFileLoc() != FileLoc)
2745 PP->Diag(ConditionalStack.back().IfLoc,
2746 diag::err_pp_unterminated_conditional);
2747 ConditionalStack.pop_back();
2748 }
2749
2750 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2751 // a pedwarn.
2752 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2753 DiagnosticsEngine &Diags = PP->getDiagnostics();
2754 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2755 unsigned DiagID;
2756
2757 if (LangOpts.CPlusPlus11) {
2758 // C++11 [lex.phases] 2.2 p2
2759 // Prefer the C++98 pedantic compatibility warning over the generic,
2760 // non-extension, user-requested "missing newline at EOF" warning.
2761 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2762 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2763 } else {
2764 DiagID = diag::warn_no_newline_eof;
2765 }
2766 } else {
2767 DiagID = diag::ext_no_newline_eof;
2768 }
2769
2770 Diag(BufferEnd, DiagID)
2771 << FixItHint::CreateInsertion(EndLoc, "\n");
2772 }
2773
2774 BufferPtr = CurPtr;
2775
2776 // Finally, let the preprocessor handle this.
2777 return PP->HandleEndOfFile(Result, isPragmaLexer());
2778}
2779
2780/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2781/// the specified lexer will return a tok::l_paren token, 0 if it is something
2782/// else and 2 if there are no more tokens in the buffer controlled by the
2783/// lexer.
2784unsigned Lexer::isNextPPTokenLParen() {
2785 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")((!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? static_cast<void> (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2785, __PRETTY_FUNCTION__))
;
2786
2787 // Switch to 'skipping' mode. This will ensure that we can lex a token
2788 // without emitting diagnostics, disables macro expansion, and will cause EOF
2789 // to return an EOF token instead of popping the include stack.
2790 LexingRawMode = true;
2791
2792 // Save state that can be changed while lexing so that we can restore it.
2793 const char *TmpBufferPtr = BufferPtr;
2794 bool inPPDirectiveMode = ParsingPreprocessorDirective;
2795 bool atStartOfLine = IsAtStartOfLine;
2796 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2797 bool leadingSpace = HasLeadingSpace;
2798
2799 Token Tok;
2800 Lex(Tok);
2801
2802 // Restore state that may have changed.
2803 BufferPtr = TmpBufferPtr;
2804 ParsingPreprocessorDirective = inPPDirectiveMode;
2805 HasLeadingSpace = leadingSpace;
2806 IsAtStartOfLine = atStartOfLine;
2807 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2808
2809 // Restore the lexer back to non-skipping mode.
2810 LexingRawMode = false;
2811
2812 if (Tok.is(tok::eof))
2813 return 2;
2814 return Tok.is(tok::l_paren);
2815}
2816
2817/// Find the end of a version control conflict marker.
2818static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2819 ConflictMarkerKind CMK) {
2820 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2821 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2822 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2823 size_t Pos = RestOfBuffer.find(Terminator);
2824 while (Pos != StringRef::npos) {
2825 // Must occur at start of line.
2826 if (Pos == 0 ||
2827 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2828 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2829 Pos = RestOfBuffer.find(Terminator);
2830 continue;
2831 }
2832 return RestOfBuffer.data()+Pos;
2833 }
2834 return nullptr;
2835}
2836
2837/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2838/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2839/// and recover nicely. This returns true if it is a conflict marker and false
2840/// if not.
2841bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2842 // Only a conflict marker if it starts at the beginning of a line.
2843 if (CurPtr != BufferStart &&
2844 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2845 return false;
2846
2847 // Check to see if we have <<<<<<< or >>>>.
2848 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2849 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2850 return false;
2851
2852 // If we have a situation where we don't care about conflict markers, ignore
2853 // it.
2854 if (CurrentConflictMarkerState || isLexingRawMode())
2855 return false;
2856
2857 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2858
2859 // Check to see if there is an ending marker somewhere in the buffer at the
2860 // start of a line to terminate this conflict marker.
2861 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2862 // We found a match. We are really in a conflict marker.
2863 // Diagnose this, and ignore to the end of line.
2864 Diag(CurPtr, diag::err_conflict_marker);
2865 CurrentConflictMarkerState = Kind;
2866
2867 // Skip ahead to the end of line. We know this exists because the
2868 // end-of-conflict marker starts with \r or \n.
2869 while (*CurPtr != '\r' && *CurPtr != '\n') {
2870 assert(CurPtr != BufferEnd && "Didn't find end of line")((CurPtr != BufferEnd && "Didn't find end of line") ?
static_cast<void> (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2870, __PRETTY_FUNCTION__))
;
2871 ++CurPtr;
2872 }
2873 BufferPtr = CurPtr;
2874 return true;
2875 }
2876
2877 // No end of conflict marker found.
2878 return false;
2879}
2880
2881/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2882/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2883/// is the end of a conflict marker. Handle it by ignoring up until the end of
2884/// the line. This returns true if it is a conflict marker and false if not.
2885bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2886 // Only a conflict marker if it starts at the beginning of a line.
2887 if (CurPtr != BufferStart &&
2888 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2889 return false;
2890
2891 // If we have a situation where we don't care about conflict markers, ignore
2892 // it.
2893 if (!CurrentConflictMarkerState || isLexingRawMode())
2894 return false;
2895
2896 // Check to see if we have the marker (4 characters in a row).
2897 for (unsigned i = 1; i != 4; ++i)
2898 if (CurPtr[i] != CurPtr[0])
2899 return false;
2900
2901 // If we do have it, search for the end of the conflict marker. This could
2902 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2903 // be the end of conflict marker.
2904 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2905 CurrentConflictMarkerState)) {
2906 CurPtr = End;
2907
2908 // Skip ahead to the end of line.
2909 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2910 ++CurPtr;
2911
2912 BufferPtr = CurPtr;
2913
2914 // No longer in the conflict marker.
2915 CurrentConflictMarkerState = CMK_None;
2916 return true;
2917 }
2918
2919 return false;
2920}
2921
/// Scan [CurPtr, BufferEnd) for the "#>" that closes an editor placeholder.
/// Returns a pointer just past the terminator, or null if none is found.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Only scan up to the second-to-last character, since the terminator is
  // two characters wide.
  BufferEnd -= 1;
  for (; CurPtr != BufferEnd; ++CurPtr)
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  return nullptr;
}
2933
2934bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2935 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")((CurPtr[-1] == '<' && CurPtr[0] == '#' &&
"Not a placeholder!") ? static_cast<void> (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 2935, __PRETTY_FUNCTION__))
;
2936 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2937 return false;
2938 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2939 if (!End)
2940 return false;
2941 const char *Start = CurPtr - 1;
2942 if (!LangOpts.AllowEditorPlaceholders)
2943 Diag(Start, diag::err_placeholder_in_source);
2944 Result.startToken();
2945 FormTokenWithChars(Result, End, tok::raw_identifier);
2946 Result.setRawIdentifierData(Start);
2947 PP->LookUpIdentifierInfo(Result);
2948 Result.setFlag(Token::IsEditorPlaceholder);
2949 BufferPtr = End;
2950 return true;
2951}
2952
2953bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2954 if (PP && PP->isCodeCompletionEnabled()) {
2955 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2956 return Loc == PP->getCodeCompletionLoc();
2957 }
2958
2959 return false;
2960}
2961
2962uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2963 Token *Result) {
2964 unsigned CharSize;
2965 char Kind = getCharAndSize(StartPtr, CharSize);
2966
2967 unsigned NumHexDigits;
2968 if (Kind == 'u')
2969 NumHexDigits = 4;
2970 else if (Kind == 'U')
2971 NumHexDigits = 8;
2972 else
2973 return 0;
2974
2975 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2976 if (Result && !isLexingRawMode())
2977 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2978 return 0;
2979 }
2980
2981 const char *CurPtr = StartPtr + CharSize;
2982 const char *KindLoc = &CurPtr[-1];
2983
2984 uint32_t CodePoint = 0;
2985 for (unsigned i = 0; i < NumHexDigits; ++i) {
2986 char C = getCharAndSize(CurPtr, CharSize);
2987
2988 unsigned Value = llvm::hexDigitValue(C);
2989 if (Value == -1U) {
2990 if (Result && !isLexingRawMode()) {
2991 if (i == 0) {
2992 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2993 << StringRef(KindLoc, 1);
2994 } else {
2995 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2996
2997 // If the user wrote \U1234, suggest a fixit to \u.
2998 if (i == 4 && NumHexDigits == 8) {
2999 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3000 Diag(KindLoc, diag::note_ucn_four_not_eight)
3001 << FixItHint::CreateReplacement(URange, "u");
3002 }
3003 }
3004 }
3005
3006 return 0;
3007 }
3008
3009 CodePoint <<= 4;
3010 CodePoint += Value;
3011
3012 CurPtr += CharSize;
3013 }
3014
3015 if (Result) {
3016 Result->setFlag(Token::HasUCN);
3017 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3018 StartPtr = CurPtr;
3019 else
3020 while (StartPtr != CurPtr)
3021 (void)getAndAdvanceChar(StartPtr, *Result);
3022 } else {
3023 StartPtr = CurPtr;
3024 }
3025
3026 // Don't apply C family restrictions to UCNs in assembly mode
3027 if (LangOpts.AsmPreprocessor)
3028 return CodePoint;
3029
3030 // C99 6.4.3p2: A universal character name shall not specify a character whose
3031 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3032 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3033 // C++11 [lex.charset]p2: If the hexadecimal value for a
3034 // universal-character-name corresponds to a surrogate code point (in the
3035 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3036 // if the hexadecimal value for a universal-character-name outside the
3037 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3038 // string literal corresponds to a control character (in either of the
3039 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3040 // basic source character set, the program is ill-formed.
3041 if (CodePoint < 0xA0) {
3042 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3043 return CodePoint;
3044
3045 // We don't use isLexingRawMode() here because we need to warn about bad
3046 // UCNs even when skipping preprocessing tokens in a #if block.
3047 if (Result && PP) {
3048 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3049 Diag(BufferPtr, diag::err_ucn_control_character);
3050 else {
3051 char C = static_cast<char>(CodePoint);
3052 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3053 }
3054 }
3055
3056 return 0;
3057 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3058 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3059 // We don't use isLexingRawMode() here because we need to diagnose bad
3060 // UCNs even when skipping preprocessing tokens in a #if block.
3061 if (Result && PP) {
3062 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3063 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3064 else
3065 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3066 }
3067 return 0;
3068 }
3069
3070 return CodePoint;
3071}
3072
3073bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3074 const char *CurPtr) {
3075 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3076 UnicodeWhitespaceCharRanges);
3077 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3078 UnicodeWhitespaceChars.contains(C)) {
3079 Diag(BufferPtr, diag::ext_unicode_whitespace)
3080 << makeCharRange(*this, BufferPtr, CurPtr);
3081
3082 Result.setFlag(Token::LeadingSpace);
3083 return true;
3084 }
3085 return false;
3086}
3087
3088bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3089 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3090 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3091 !PP->isPreprocessedOutput()) {
3092 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3093 makeCharRange(*this, BufferPtr, CurPtr),
3094 /*IsFirst=*/true);
3095 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3096 makeCharRange(*this, BufferPtr, CurPtr));
3097 }
3098
3099 MIOpt.ReadToken();
3100 return LexIdentifier(Result, CurPtr);
3101 }
3102
3103 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3104 !PP->isPreprocessedOutput() &&
3105 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3106 // Non-ASCII characters tend to creep into source code unintentionally.
3107 // Instead of letting the parser complain about the unknown token,
3108 // just drop the character.
3109 // Note that we can /only/ do this when the non-ASCII character is actually
3110 // spelled as Unicode, not written as a UCN. The standard requires that
3111 // we not throw away any possible preprocessor tokens, but there's a
3112 // loophole in the mapping of Unicode characters to basic character set
3113 // characters that allows us to map these particular characters to, say,
3114 // whitespace.
3115 Diag(BufferPtr, diag::err_non_ascii)
3116 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3117
3118 BufferPtr = CurPtr;
3119 return false;
3120 }
3121
3122 // Otherwise, we have an explicit UCN or a character that's unlikely to show
3123 // up by accident.
3124 MIOpt.ReadToken();
3125 FormTokenWithChars(Result, CurPtr, tok::unknown);
3126 return true;
3127}
3128
3129void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3130 IsAtStartOfLine = Result.isAtStartOfLine();
3131 HasLeadingSpace = Result.hasLeadingSpace();
3132 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3133 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3134}
3135
3136bool Lexer::Lex(Token &Result) {
3137 // Start a new token.
3138 Result.startToken();
3139
3140 // Set up misc whitespace flags for LexTokenInternal.
3141 if (IsAtStartOfLine) {
3142 Result.setFlag(Token::StartOfLine);
3143 IsAtStartOfLine = false;
3144 }
3145
3146 if (HasLeadingSpace) {
3147 Result.setFlag(Token::LeadingSpace);
3148 HasLeadingSpace = false;
3149 }
3150
3151 if (HasLeadingEmptyMacro) {
3152 Result.setFlag(Token::LeadingEmptyMacro);
3153 HasLeadingEmptyMacro = false;
3154 }
3155
3156 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3157 IsAtPhysicalStartOfLine = false;
3158 bool isRawLex = isLexingRawMode();
3159 (void) isRawLex;
3160 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3161 // (After the LexTokenInternal call, the lexer might be destroyed.)
3162 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(((returnedToken || !isRawLex) && "Raw lex must succeed"
) ? static_cast<void> (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/lib/Lex/Lexer.cpp"
, 3162, __PRETTY_FUNCTION__))
;
3163 return returnedToken;
3164}
3165
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
///
/// Returns true if a token was filled into \p Result; returns false after
/// handling a preprocessor directive, in which case the caller must re-lex
/// with the (possibly changed) preprocessor state.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // Re-entered via goto whenever we consumed only whitespace/comments and
  // still owe the caller a real token.

  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped. The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  // Kind of the token for the simple cases that fall out of the bottom of
  // the switch; the complex cases return directly from their case labels.
  tok::TokenKind Kind;

  switch (Char) {
  case 0: // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // An embedded NUL that is not EOF/completion: diagnose and treat it as
    // whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26: // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Fold a CRLF pair into a single newline by consuming the '\n' too.
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$': // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // A '.' followed by a digit starts a floating constant.
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') { // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') { // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') { // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') { // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment. There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program. For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo". Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line). Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') { // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace; // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash; // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else { // '%:' -> '#'
        // We parsed a # character. If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive. Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // Inside #include: lex the <...> header name as a single token.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (getLangOpts().CPlusPlus2a) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if ((LangOpts.CPlusPlus ||
                LangOpts.DoubleSquareBracketAttributes) &&
               Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character. If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive. Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicode(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}

/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Lexer interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_LEXER_H
14#define LLVM_CLANG_LEX_LEXER_H
15
16#include "clang/Basic/LangOptions.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "clang/Lex/PreprocessorLexer.h"
20#include "clang/Lex/Token.h"
21#include "llvm/ADT/Optional.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/ADT/StringRef.h"
24#include <cassert>
25#include <cstdint>
26#include <string>
27
28namespace llvm {
29
30class MemoryBuffer;
31
32} // namespace llvm
33
34namespace clang {
35
36class DiagnosticBuilder;
37class Preprocessor;
38class SourceManager;
39
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from. These are the markers left in source files by version
/// control tools after an unresolved merge.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,

  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};
54
/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  /// Size of the preamble in bytes.
  unsigned Size;

  /// Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size{Size}, PreambleEndsAtStartOfLine{PreambleEndsAtStartOfLine} {}
};
71
72/// Lexer - This provides a simple interface that turns a text buffer into a
73/// stream of tokens. This provides no support for file reading or buffering,
74/// or buffering/seeking of tokens, only forward lexing is supported. It relies
75/// on the specified Preprocessor object to handle preprocessor directives, etc.
76class Lexer : public PreprocessorLexer {
77 friend class Preprocessor;
78
79 void anchor() override;
80
81 //===--------------------------------------------------------------------===//
82 // Constant configuration values for this lexer.
83
84 // Start of the buffer.
85 const char *BufferStart;
86
87 // End of the buffer.
88 const char *BufferEnd;
89
90 // Location for start of file.
91 SourceLocation FileLoc;
92
93 // LangOpts enabled by this language (cache).
94 LangOptions LangOpts;
95
96 // True if lexer for _Pragma handling.
97 bool Is_PragmaLexer;
98
99 //===--------------------------------------------------------------------===//
100 // Context-specific lexing flags set by the preprocessor.
101 //
102
103 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
104 /// and return them as tokens. This is used for -C and -CC modes, and
105 /// whitespace preservation can be useful for some clients that want to lex
106 /// the file in raw mode and get every character from the file.
107 ///
108 /// When this is set to 2 it returns comments and whitespace. When set to 1
109 /// it returns comments, when it is set to 0 it returns normal tokens only.
110 unsigned char ExtendedTokenMode;
111
112 //===--------------------------------------------------------------------===//
113 // Context that changes as the file is lexed.
114 // NOTE: any state that mutates when in raw mode must have save/restore code
115 // in Lexer::isNextPPTokenLParen.
116
117 // BufferPtr - Current pointer into the buffer. This is the next character
118 // to be lexed.
119 const char *BufferPtr;
120
121 // IsAtStartOfLine - True if the next lexed token should get the "start of
122 // line" flag set on it.
123 bool IsAtStartOfLine;
124
125 bool IsAtPhysicalStartOfLine;
126
127 bool HasLeadingSpace;
128
129 bool HasLeadingEmptyMacro;
130
131 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
132 ConflictMarkerKind CurrentConflictMarkerState;
133
134 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
135
136public:
137 /// Lexer constructor - Create a new lexer object for the specified buffer
138 /// with the specified preprocessor managing the lexing process. This lexer
139 /// assumes that the associated file buffer and Preprocessor objects will
140 /// outlive it, so it doesn't take ownership of either of them.
141 Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP);
142
143 /// Lexer constructor - Create a new raw lexer object. This object is only
144 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
145 /// text range will outlive it, so it doesn't take ownership of it.
146 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
147 const char *BufStart, const char *BufPtr, const char *BufEnd);
148
149 /// Lexer constructor - Create a new raw lexer object. This object is only
150 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
151 /// text range will outlive it, so it doesn't take ownership of it.
152 Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
153 const SourceManager &SM, const LangOptions &LangOpts);
154
155 Lexer(const Lexer &) = delete;
156 Lexer &operator=(const Lexer &) = delete;
157
158 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
159 /// _Pragma expansion. This has a variety of magic semantics that this method
160 /// sets up. It returns a new'd Lexer that must be delete'd when done.
161 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
162 SourceLocation ExpansionLocStart,
163 SourceLocation ExpansionLocEnd,
164 unsigned TokLen, Preprocessor &PP);
165
166 /// getLangOpts - Return the language features currently enabled.
167 /// NOTE: this lexer modifies features as a file is parsed!
168 const LangOptions &getLangOpts() const { return LangOpts; }
169
170 /// getFileLoc - Return the File Location for the file we are lexing out of.
171 /// The physical location encodes the location where the characters come from,
172 /// the virtual location encodes where we should *claim* the characters came
173 /// from. Currently this is only used by _Pragma handling.
174 SourceLocation getFileLoc() const { return FileLoc; }
175
176private:
177 /// Lex - Return the next token in the file. If this is the end of file, it
178 /// return the tok::eof token. This implicitly involves the preprocessor.
179 bool Lex(Token &Result);
180
181public:
182 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
183 bool isPragmaLexer() const { return Is_PragmaLexer; }
184
185private:
186 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
187 /// the PreprocessorLexer interface.
188 void IndirectLex(Token &Result) override { Lex(Result); }
189
190public:
191 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
192 /// associated preprocessor object. Return true if the 'next character to
193 /// read' pointer points at the end of the lexer buffer, false otherwise.
194 bool LexFromRawLexer(Token &Result) {
195 assert(LexingRawMode && "Not already in raw mode!")((LexingRawMode && "Not already in raw mode!") ? static_cast
<void> (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 195, __PRETTY_FUNCTION__))
;
196 Lex(Result);
197 // Note that lexing to the end of the buffer doesn't implicitly delete the
198 // lexer when in raw mode.
199 return BufferPtr == BufferEnd;
200 }
201
202 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
203 /// every character in the file, including whitespace and comments. This
204 /// should only be used in raw mode, as the preprocessor is not prepared to
205 /// deal with the excess tokens.
206 bool isKeepWhitespaceMode() const {
207 return ExtendedTokenMode > 1;
208 }
209
210 /// SetKeepWhitespaceMode - This method lets clients enable or disable
211 /// whitespace retention mode.
212 void SetKeepWhitespaceMode(bool Val) {
213 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 214, __PRETTY_FUNCTION__))
214 "Can only retain whitespace in raw mode or -traditional-cpp")(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 214, __PRETTY_FUNCTION__))
;
215 ExtendedTokenMode = Val ? 2 : 0;
216 }
217
218 /// inKeepCommentMode - Return true if the lexer should return comments as
219 /// tokens.
220 bool inKeepCommentMode() const {
221 return ExtendedTokenMode > 0;
222 }
223
224 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
225 /// to the specified mode. This is really only useful when lexing in raw
226 /// mode, because otherwise the lexer needs to manage this.
227 void SetCommentRetentionState(bool Mode) {
228 assert(!isKeepWhitespaceMode() &&((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 229, __PRETTY_FUNCTION__))
229 "Can't play with comment retention state when retaining whitespace")((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 229, __PRETTY_FUNCTION__))
;
230 ExtendedTokenMode = Mode ? 1 : 0;
231 }
232
233 /// Sets the extended token mode back to its initial value, according to the
234 /// language options and preprocessor. This controls whether the lexer
235 /// produces comment and whitespace tokens.
236 ///
237 /// This requires the lexer to have an associated preprocessor. A standalone
238 /// lexer has nothing to reset to.
239 void resetExtendedTokenMode();
240
241 /// Gets source code buffer.
242 StringRef getBuffer() const {
243 return StringRef(BufferStart, BufferEnd - BufferStart);
244 }
245
246 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
247 /// uninterpreted string. This switches the lexer out of directive mode.
248 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
249
250
251 /// Diag - Forwarding function for diagnostics. This translate a source
252 /// position in the current buffer into a SourceLocation object for rendering.
253 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
254
255 /// getSourceLocation - Return a source location identifier for the specified
256 /// offset in the current file.
257 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
258
259 /// getSourceLocation - Return a source location for the next character in
260 /// the current file.
261 SourceLocation getSourceLocation() override {
262 return getSourceLocation(BufferPtr);
263 }
264
265 /// Return the current location in the buffer.
266 const char *getBufferLocation() const { return BufferPtr; }
267
268 /// Returns the current lexing offset.
269 unsigned getCurrentBufferOffset() {
270 assert(BufferPtr >= BufferStart && "Invalid buffer state")((BufferPtr >= BufferStart && "Invalid buffer state"
) ? static_cast<void> (0) : __assert_fail ("BufferPtr >= BufferStart && \"Invalid buffer state\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Lexer.h"
, 270, __PRETTY_FUNCTION__))
;
271 return BufferPtr - BufferStart;
272 }
273
274 /// Skip over \p NumBytes bytes.
275 ///
276 /// If the skip is successful, the next token will be lexed from the new
277 /// offset. The lexer also assumes that we skipped to the start of the line.
278 ///
279 /// \returns true if the skip failed (new offset would have been past the
280 /// end of the buffer), false otherwise.
281 bool skipOver(unsigned NumBytes);
282
283 /// Stringify - Convert the specified string into a C string by i) escaping
284 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
285 /// If Charify is true, this escapes the ' character instead of ".
286 static std::string Stringify(StringRef Str, bool Charify = false);
287
288 /// Stringify - Convert the specified string into a C string by i) escaping
289 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
290 static void Stringify(SmallVectorImpl<char> &Str);
291
292 /// getSpelling - This method is used to get the spelling of a token into a
293 /// preallocated buffer, instead of as an std::string. The caller is required
294 /// to allocate enough space for the token, which is guaranteed to be at least
295 /// Tok.getLength() bytes long. The length of the actual result is returned.
296 ///
297 /// Note that this method may do two possible things: it may either fill in
298 /// the buffer specified with characters, or it may *change the input pointer*
299 /// to point to a constant buffer with the data already in it (avoiding a
300 /// copy). The caller is not allowed to modify the returned buffer pointer
301 /// if an internal buffer is returned.
302 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
303 const SourceManager &SourceMgr,
304 const LangOptions &LangOpts,
305 bool *Invalid = nullptr);
306
307 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
308 /// token is the characters used to represent the token in the source file
309 /// after trigraph expansion and escaped-newline folding. In particular, this
310 /// wants to get the true, uncanonicalized, spelling of things like digraphs
311 /// UCNs, etc.
312 static std::string getSpelling(const Token &Tok,
313 const SourceManager &SourceMgr,
314 const LangOptions &LangOpts,
315 bool *Invalid = nullptr);
316
317 /// getSpelling - This method is used to get the spelling of the
318 /// token at the given source location. If, as is usually true, it
319 /// is not necessary to copy any data, then the returned string may
320 /// not point into the provided buffer.
321 ///
322 /// This method lexes at the expansion depth of the given
323 /// location and does not jump to the expansion or spelling
324 /// location.
325 static StringRef getSpelling(SourceLocation loc,
326 SmallVectorImpl<char> &buffer,
327 const SourceManager &SM,
328 const LangOptions &options,
329 bool *invalid = nullptr);
330
331 /// MeasureTokenLength - Relex the token at the specified location and return
332 /// its length in bytes in the input file. If the token needs cleaning (e.g.
333 /// includes a trigraph or an escaped newline) then this count includes bytes
334 /// that are part of that.
335 static unsigned MeasureTokenLength(SourceLocation Loc,
336 const SourceManager &SM,
337 const LangOptions &LangOpts);
338
339 /// Relex the token at the specified location.
340 /// \returns true if there was a failure, false on success.
341 static bool getRawToken(SourceLocation Loc, Token &Result,
342 const SourceManager &SM,
343 const LangOptions &LangOpts,
344 bool IgnoreWhiteSpace = false);
345
346 /// Given a location any where in a source buffer, find the location
347 /// that corresponds to the beginning of the token in which the original
348 /// source location lands.
349 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
350 const SourceManager &SM,
351 const LangOptions &LangOpts);
352
353 /// Get the physical length (including trigraphs and escaped newlines) of the
354 /// first \p Characters characters of the token starting at TokStart.
355 static unsigned getTokenPrefixLength(SourceLocation TokStart,
356 unsigned CharNo,
357 const SourceManager &SM,
358 const LangOptions &LangOpts);
359
360 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
361 /// location at the start of a token, return a new location that specifies a
362 /// character within the token. This handles trigraphs and escaped newlines.
363 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
364 unsigned Characters,
365 const SourceManager &SM,
366 const LangOptions &LangOpts) {
367 return TokStart.getLocWithOffset(
368 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
369 }
370
371 /// Computes the source location just past the end of the
372 /// token at this source location.
373 ///
374 /// This routine can be used to produce a source location that
375 /// points just past the end of the token referenced by \p Loc, and
376 /// is generally used when a diagnostic needs to point just after a
377 /// token where it expected something different that it received. If
378 /// the returned source location would not be meaningful (e.g., if
379 /// it points into a macro), this routine returns an invalid
380 /// source location.
381 ///
382 /// \param Offset an offset from the end of the token, where the source
383 /// location should refer to. The default offset (0) produces a source
384 /// location pointing just past the end of the token; an offset of 1 produces
385 /// a source location pointing to the last character in the token, etc.
386 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
387 const SourceManager &SM,
388 const LangOptions &LangOpts);
389
390 /// Given a token range, produce a corresponding CharSourceRange that
391 /// is not a token range. This allows the source range to be used by
392 /// components that don't have access to the lexer and thus can't find the
393 /// end of the range for themselves.
394 static CharSourceRange getAsCharRange(SourceRange Range,
395 const SourceManager &SM,
396 const LangOptions &LangOpts) {
397 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
398 return End.isInvalid() ? CharSourceRange()
399 : CharSourceRange::getCharRange(
400 Range.getBegin(), End);
401 }
402 static CharSourceRange getAsCharRange(CharSourceRange Range,
403 const SourceManager &SM,
404 const LangOptions &LangOpts) {
405 return Range.isTokenRange()
406 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
407 : Range;
408 }
409
410 /// Returns true if the given MacroID location points at the first
411 /// token of the macro expansion.
412 ///
413 /// \param MacroBegin If non-null and function returns true, it is set to
414 /// begin location of the macro.
415 static bool isAtStartOfMacroExpansion(SourceLocation loc,
416 const SourceManager &SM,
417 const LangOptions &LangOpts,
418 SourceLocation *MacroBegin = nullptr);
419
420 /// Returns true if the given MacroID location points at the last
421 /// token of the macro expansion.
422 ///
423 /// \param MacroEnd If non-null and function returns true, it is set to
424 /// end location of the macro.
425 static bool isAtEndOfMacroExpansion(SourceLocation loc,
426 const SourceManager &SM,
427 const LangOptions &LangOpts,
428 SourceLocation *MacroEnd = nullptr);
429
430 /// Accepts a range and returns a character range with file locations.
431 ///
432 /// Returns a null range if a part of the range resides inside a macro
433 /// expansion or the range does not reside on the same FileID.
434 ///
435 /// This function is trying to deal with macros and return a range based on
436 /// file locations. The cases where it can successfully handle macros are:
437 ///
438 /// -begin or end range lies at the start or end of a macro expansion, in
439 /// which case the location will be set to the expansion point, e.g:
440 /// \#define M 1 2
441 /// a M
442 /// If you have a range [a, 2] (where 2 came from the macro), the function
443 /// will return a range for "a M"
444 /// if you have range [a, 1], the function will fail because the range
445 /// overlaps with only a part of the macro
446 ///
447 /// -The macro is a function macro and the range can be mapped to the macro
448 /// arguments, e.g:
449 /// \#define M 1 2
450 /// \#define FM(x) x
451 /// FM(a b M)
452 /// if you have range [b, 2], the function will return the file range "b M"
453 /// inside the macro arguments.
454 /// if you have range [a, 2], the function will return the file range
455 /// "FM(a b M)" since the range includes all of the macro expansion.
456 static CharSourceRange makeFileCharRange(CharSourceRange Range,
457 const SourceManager &SM,
458 const LangOptions &LangOpts);
459
460 /// Returns a string for the source that the range encompasses.
461 static StringRef getSourceText(CharSourceRange Range,
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 bool *Invalid = nullptr);
465
466 /// Retrieve the name of the immediate macro expansion.
467 ///
468 /// This routine starts from a source location, and finds the name of the macro
469 /// responsible for its immediate expansion. It looks through any intervening
470 /// macro argument expansions to compute this. It returns a StringRef which
471 /// refers to the SourceManager-owned buffer of the source where that macro
472 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
473 static StringRef getImmediateMacroName(SourceLocation Loc,
474 const SourceManager &SM,
475 const LangOptions &LangOpts);
476
477 /// Retrieve the name of the immediate macro expansion.
478 ///
479 /// This routine starts from a source location, and finds the name of the
480 /// macro responsible for its immediate expansion. It looks through any
481 /// intervening macro argument expansions to compute this. It returns a
482 /// StringRef which refers to the SourceManager-owned buffer of the source
483 /// where that macro name is spelled. Thus, the result shouldn't out-live
484 /// that SourceManager.
485 ///
486 /// This differs from Lexer::getImmediateMacroName in that any macro argument
487 /// location will result in the topmost function macro that accepted it.
488 /// e.g.
489 /// \code
490 /// MAC1( MAC2(foo) )
491 /// \endcode
492 /// for location of 'foo' token, this function will return "MAC1" while
493 /// Lexer::getImmediateMacroName will return "MAC2".
494 static StringRef getImmediateMacroNameForDiagnostics(
495 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
496
497 /// Compute the preamble of the given file.
498 ///
499 /// The preamble of a file contains the initial comments, include directives,
500 /// and other preprocessor directives that occur before the code in this
501 /// particular file actually begins. The preamble of the main source file is
502 /// a potential prefix header.
503 ///
504 /// \param Buffer The memory buffer containing the file's contents.
505 ///
506 /// \param MaxLines If non-zero, restrict the length of the preamble
507 /// to fewer than this number of lines.
508 ///
509 /// \returns The offset into the file where the preamble ends and the rest
510 /// of the file begins along with a boolean value indicating whether
511 /// the preamble ends at the beginning of a new line.
512 static PreambleBounds ComputePreamble(StringRef Buffer,
513 const LangOptions &LangOpts,
514 unsigned MaxLines = 0);
515
516 /// Finds the token that comes right after the given location.
517 ///
518 /// Returns the next token, or none if the location is inside a macro.
519 static Optional<Token> findNextToken(SourceLocation Loc,
520 const SourceManager &SM,
521 const LangOptions &LangOpts);
522
523 /// Checks that the given token is the first token that occurs after
524 /// the given location (this excludes comments and whitespace). Returns the
525 /// location immediately after the specified token. If the token is not found
526 /// or the location is inside a macro, the returned source location will be
527 /// invalid.
528 static SourceLocation findLocationAfterToken(SourceLocation loc,
529 tok::TokenKind TKind,
530 const SourceManager &SM,
531 const LangOptions &LangOpts,
532 bool SkipTrailingWhitespaceAndNewLine);
533
534 /// Returns true if the given character could appear in an identifier.
535 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
536
537 /// Checks whether new line pointed by Str is preceded by escape
538 /// sequence.
539 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
540
541 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
542 /// emit a warning.
543 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
544 const LangOptions &LangOpts) {
545 // If this is not a trigraph and not a UCN or escaped newline, return
546 // quickly.
547 if (isObviouslySimpleCharacter(Ptr[0])) {
548 Size = 1;
549 return *Ptr;
550 }
551
552 Size = 0;
553 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
554 }
555
556 /// Returns the leading whitespace for line that corresponds to the given
557 /// location \p Loc.
558 static StringRef getIndentationForLine(SourceLocation Loc,
559 const SourceManager &SM);
560
561private:
562 //===--------------------------------------------------------------------===//
563 // Internal implementation interfaces.
564
565 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
566 /// by Lex.
567 ///
568 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
569
570 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
571
572 /// Given that a token begins with the Unicode character \p C, figure out
573 /// what kind of token it is and dispatch to the appropriate lexing helper
574 /// function.
575 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
576
577 /// FormTokenWithChars - When we lex a token, we have identified a span
578 /// starting at BufferPtr, going to TokEnd that forms the token. This method
579 /// takes that range and assigns it to the token as its location and size. In
580 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
581 /// TokEnd.
582 void FormTokenWithChars(Token &Result, const char *TokEnd,
583 tok::TokenKind Kind) {
584 unsigned TokLen = TokEnd-BufferPtr;
585 Result.setLength(TokLen);
586 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
587 Result.setKind(Kind);
588 BufferPtr = TokEnd;
589 }
590
591 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
592 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
593 /// tokens in the buffer controlled by this lexer.
594 unsigned isNextPPTokenLParen();
595
596 //===--------------------------------------------------------------------===//
597 // Lexer character reading interfaces.
598
599 // This lexer is built on two interfaces for reading characters, both of which
600 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
601 // when we know that we will be reading a character from the input buffer and
602 // that this character will be part of the result token. This occurs in (f.e.)
603 // string processing, because we know we need to read until we find the
604 // closing '"' character.
605 //
606 // The second interface is the combination of getCharAndSize with
607 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
608 // returning it and its size. If the lexer decides that this character is
609 // part of the current token, it calls ConsumeChar on it. This two stage
610 // approach allows us to emit diagnostics for characters (e.g. warnings about
611 // trigraphs), knowing that they only are emitted if the character is
612 // consumed.
613
614 /// isObviouslySimpleCharacter - Return true if the specified character is
615 /// obviously the same in translation phase 1 and translation phase 3. This
616 /// can return false for characters that end up being the same, but it will
617 /// never return true for something that needs to be mapped.
618 static bool isObviouslySimpleCharacter(char C) {
619 return C != '?' && C != '\\';
620 }
621
622 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
623 /// advance over it, and return it. This is tricky in several cases. Here we
624 /// just handle the trivial case and fall-back to the non-inlined
625 /// getCharAndSizeSlow method to handle the hard case.
626 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
627 // If this is not a trigraph and not a UCN or escaped newline, return
628 // quickly.
629 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
6
Taking false branch
630
631 unsigned Size = 0;
632 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
7
Calling 'Lexer::getCharAndSizeSlow'
633 Ptr += Size;
634 return C;
635 }
636
637 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
638 /// and added to a given token, check to see if there are diagnostics that
639 /// need to be emitted or flags that need to be set on the token. If so, do
640 /// it.
641 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
642 // Normal case, we consumed exactly one token. Just return it.
643 if (Size == 1)
644 return Ptr+Size;
645
646 // Otherwise, re-lex the character with a current token, allowing
647 // diagnostics to be emitted and flags to be set.
648 Size = 0;
649 getCharAndSizeSlow(Ptr, Size, &Tok);
650 return Ptr+Size;
651 }
652
653 /// getCharAndSize - Peek a single 'character' from the specified buffer,
654 /// get its size, and return it. This is tricky in several cases. Here we
655 /// just handle the trivial case and fall-back to the non-inlined
656 /// getCharAndSizeSlow method to handle the hard case.
657 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
658 // If this is not a trigraph and not a UCN or escaped newline, return
659 // quickly.
660 if (isObviouslySimpleCharacter(Ptr[0])) {
661 Size = 1;
662 return *Ptr;
663 }
664
665 Size = 0;
666 return getCharAndSizeSlow(Ptr, Size);
667 }
668
669 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
670 /// method.
671 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
672 Token *Tok = nullptr);
673
674 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
675 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
676 /// to this function.
677 static unsigned getEscapedNewLineSize(const char *P);
678
679 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
680 /// them), skip over them and return the first non-escaped-newline found,
681 /// otherwise return P.
682 static const char *SkipEscapedNewLines(const char *P);
683
684 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
685 /// diagnostic.
686 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
687 const LangOptions &LangOpts);
688
689 //===--------------------------------------------------------------------===//
690 // Other lexer functions.
691
692 void SetByteOffset(unsigned Offset, bool StartOfLine);
693
694 void PropagateLineStartLeadingSpaceInfo(Token &Result);
695
696 const char *LexUDSuffix(Token &Result, const char *CurPtr,
697 bool IsStringLiteral);
698
699 // Helper functions to lex the remainder of a token of the specific type.
700 bool LexIdentifier (Token &Result, const char *CurPtr);
701 bool LexNumericConstant (Token &Result, const char *CurPtr);
702 bool LexStringLiteral (Token &Result, const char *CurPtr,
703 tok::TokenKind Kind);
704 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
705 tok::TokenKind Kind);
706 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
707 bool LexCharConstant (Token &Result, const char *CurPtr,
708 tok::TokenKind Kind);
709 bool LexEndOfFile (Token &Result, const char *CurPtr);
710 bool SkipWhitespace (Token &Result, const char *CurPtr,
711 bool &TokAtPhysicalStartOfLine);
712 bool SkipLineComment (Token &Result, const char *CurPtr,
713 bool &TokAtPhysicalStartOfLine);
714 bool SkipBlockComment (Token &Result, const char *CurPtr,
715 bool &TokAtPhysicalStartOfLine);
716 bool SaveLineComment (Token &Result, const char *CurPtr);
717
718 bool IsStartOfConflictMarker(const char *CurPtr);
719 bool HandleEndOfConflictMarker(const char *CurPtr);
720
721 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
722
723 bool isCodeCompletionPoint(const char *CurPtr) const;
724 void cutOffLexing() { BufferPtr = BufferEnd; }
725
726 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
727
728 void codeCompleteIncludedFile(const char *PathStart,
729 const char *CompletionPoint, bool IsAngled);
730
731 /// Read a universal character name.
732 ///
733 /// \param StartPtr The position in the source buffer after the initial '\'.
734 /// If the UCN is syntactically well-formed (but not
735 /// necessarily valid), this parameter will be updated to
736 /// point to the character after the UCN.
737 /// \param SlashLoc The position in the source buffer of the '\'.
738 /// \param Result The token being formed. Pass \c nullptr to suppress
739 /// diagnostics and handle token formation in the caller.
740 ///
741 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
742 /// invalid.
743 uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
744
745 /// Try to consume a UCN as part of an identifier at the current
746 /// location.
747 /// \param CurPtr Initially points to the range of characters in the source
748 /// buffer containing the '\'. Updated to point past the end of
749 /// the UCN on success.
750 /// \param Size The number of characters occupied by the '\' (including
751 /// trigraphs and escaped newlines).
752 /// \param Result The token being produced. Marked as containing a UCN on
753 /// success.
754 /// \return \c true if a UCN was lexed and it produced an acceptable
755 /// identifier character, \c false otherwise.
756 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
757 Token &Result);
758
759 /// Try to consume an identifier character encoded in UTF-8.
760 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
761 /// sequence. On success, updated to point past the end of it.
762 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
763 /// character was lexed, \c false otherwise.
764 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
765};
766
767} // namespace clang
768
769#endif // LLVM_CLANG_LEX_LEXER_H

/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h

1//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Token interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_TOKEN_H
14#define LLVM_CLANG_LEX_TOKEN_H
15
16#include "clang/Basic/SourceLocation.h"
17#include "clang/Basic/TokenKinds.h"
18#include "llvm/ADT/StringRef.h"
19#include <cassert>
20
21namespace clang {
22
23class IdentifierInfo;
24
25/// Token - This structure provides full information about a lexed token.
26/// It is not intended to be space efficient, it is intended to return as much
27/// information as possible about each returned token. This is expected to be
28/// compressed into a smaller form if memory footprint is important.
29///
30/// The parser can create a special "annotation token" representing a stream of
31/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
32/// can be represented by a single typename annotation token that carries
33/// information about the SourceRange of the tokens and the type object.
34class Token {
35 /// The location of the token. This is actually a SourceLocation.
36 unsigned Loc;
37
38 // Conceptually these next two fields could be in a union. However, this
39 // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
40 // routine. Keeping as separate members with casts until a more beautiful fix
41 // presents itself.
42
43 /// UintData - This holds either the length of the token text, when
44 /// a normal token, or the end of the SourceRange when an annotation
45 /// token.
46 unsigned UintData;
47
48 /// PtrData - This is a union of four different pointer types, which depends
49 /// on what type of token this is:
50 /// Identifiers, keywords, etc:
51 /// This is an IdentifierInfo*, which contains the uniqued identifier
52 /// spelling.
53 /// Literals: isLiteral() returns true.
54 /// This is a pointer to the start of the token in a text buffer, which
55 /// may be dirty (have trigraphs / escaped newlines).
56 /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
57 /// This is a pointer to sema-specific data for the annotation token.
58 /// Eof:
59 // This is a pointer to a Decl.
60 /// Other:
61 /// This is null.
62 void *PtrData;
63
64 /// Kind - The actual flavor of token this is.
65 tok::TokenKind Kind;
66
67 /// Flags - Bits we track about this token, members of the TokenFlags enum.
68 unsigned short Flags;
69
70public:
71 // Various flags set per token:
72 enum TokenFlags {
73 StartOfLine = 0x01, // At start of line or only after whitespace
74 // (considering the line after macro expansion).
75 LeadingSpace = 0x02, // Whitespace exists before this token (considering
76 // whitespace after macro expansion).
77 DisableExpand = 0x04, // This identifier may never be macro expanded.
78 NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
79 LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
80 HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
81 HasUCN = 0x40, // This identifier contains a UCN.
82 IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
83 StringifiedInMacro = 0x100, // This string or character literal is formed by
84 // macro stringizing or charizing operator.
85 CommaAfterElided = 0x200, // The comma following this token was elided (MS).
86 IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
87 IsReinjected = 0x800, // A phase 4 token that was produced before and
88 // re-added, e.g. via EnterTokenStream. Annotation
89 // tokens are *not* reinjected.
90 };
91
92 tok::TokenKind getKind() const { return Kind; }
93 void setKind(tok::TokenKind K) { Kind = K; }
94
95 /// is/isNot - Predicates to check if this token is a specific kind, as in
96 /// "if (Tok.is(tok::l_brace)) {...}".
97 bool is(tok::TokenKind K) const { return Kind == K; }
98 bool isNot(tok::TokenKind K) const { return Kind != K; }
99 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
100 return is(K1) || is(K2);
101 }
102 template <typename... Ts>
103 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const {
104 return is(K1) || isOneOf(K2, Ks...);
105 }
106
107 /// Return true if this is a raw identifier (when lexing
108 /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
109 bool isAnyIdentifier() const {
110 return tok::isAnyIdentifier(getKind());
111 }
112
113 /// Return true if this is a "literal", like a numeric
114 /// constant, string, etc.
115 bool isLiteral() const {
116 return tok::isLiteral(getKind());
117 }
118
119 /// Return true if this is any of tok::annot_* kind tokens.
120 bool isAnnotation() const {
121 return tok::isAnnotation(getKind());
122 }
123
124 /// Return a source location identifier for the specified
125 /// offset in the current file.
126 SourceLocation getLocation() const {
127 return SourceLocation::getFromRawEncoding(Loc);
128 }
129 unsigned getLength() const {
130 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 130, __PRETTY_FUNCTION__))
;
131 return UintData;
132 }
133
134 void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
135 void setLength(unsigned Len) {
136 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 136, __PRETTY_FUNCTION__))
;
137 UintData = Len;
138 }
139
140 SourceLocation getAnnotationEndLoc() const {
141 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 141, __PRETTY_FUNCTION__))
;
142 return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
143 }
144 void setAnnotationEndLoc(SourceLocation L) {
145 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 145, __PRETTY_FUNCTION__))
;
146 UintData = L.getRawEncoding();
147 }
148
149 SourceLocation getLastLoc() const {
150 return isAnnotation() ? getAnnotationEndLoc() : getLocation();
151 }
152
153 SourceLocation getEndLoc() const {
154 return isAnnotation() ? getAnnotationEndLoc()
155 : getLocation().getLocWithOffset(getLength());
156 }
157
158 /// SourceRange of the group of tokens that this annotation token
159 /// represents.
160 SourceRange getAnnotationRange() const {
161 return SourceRange(getLocation(), getAnnotationEndLoc());
162 }
163 void setAnnotationRange(SourceRange R) {
164 setLocation(R.getBegin());
165 setAnnotationEndLoc(R.getEnd());
166 }
167
168 const char *getName() const { return tok::getTokenName(Kind); }
169
170 /// Reset all flags to cleared.
171 void startToken() {
172 Kind = tok::unknown;
173 Flags = 0;
174 PtrData = nullptr;
175 UintData = 0;
176 Loc = SourceLocation().getRawEncoding();
177 }
178
179 IdentifierInfo *getIdentifierInfo() const {
180 assert(isNot(tok::raw_identifier) &&((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
181 "getIdentifierInfo() on a tok::raw_identifier token!")((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
;
182 assert(!isAnnotation() &&((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 183, __PRETTY_FUNCTION__))
183 "getIdentifierInfo() on an annotation token!")((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 183, __PRETTY_FUNCTION__))
;
184 if (isLiteral()) return nullptr;
185 if (is(tok::eof)) return nullptr;
186 return (IdentifierInfo*) PtrData;
187 }
188 void setIdentifierInfo(IdentifierInfo *II) {
189 PtrData = (void*) II;
190 }
191
192 const void *getEofData() const {
193 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 193, __PRETTY_FUNCTION__))
;
194 return reinterpret_cast<const void *>(PtrData);
195 }
196 void setEofData(const void *D) {
197 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 197, __PRETTY_FUNCTION__))
;
198 assert(!PtrData)((!PtrData) ? static_cast<void> (0) : __assert_fail ("!PtrData"
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 198, __PRETTY_FUNCTION__))
;
199 PtrData = const_cast<void *>(D);
200 }
201
202 /// getRawIdentifier - For a raw identifier token (i.e., an identifier
203 /// lexed in raw mode), returns a reference to the text substring in the
204 /// buffer if known.
205 StringRef getRawIdentifier() const {
206 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 206, __PRETTY_FUNCTION__))
;
207 return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
208 }
209 void setRawIdentifierData(const char *Ptr) {
210 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 210, __PRETTY_FUNCTION__))
;
211 PtrData = const_cast<char*>(Ptr);
212 }
213
214 /// getLiteralData - For a literal token (numeric constant, string, etc), this
215 /// returns a pointer to the start of it in the text buffer if known, null
216 /// otherwise.
217 const char *getLiteralData() const {
218 assert(isLiteral() && "Cannot get literal data of non-literal")((isLiteral() && "Cannot get literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot get literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 218, __PRETTY_FUNCTION__))
;
219 return reinterpret_cast<const char*>(PtrData);
220 }
221 void setLiteralData(const char *Ptr) {
222 assert(isLiteral() && "Cannot set literal data of non-literal")((isLiteral() && "Cannot set literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot set literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 222, __PRETTY_FUNCTION__))
;
223 PtrData = const_cast<char*>(Ptr);
224 }
225
226 void *getAnnotationValue() const {
227 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 227, __PRETTY_FUNCTION__))
;
228 return PtrData;
229 }
230 void setAnnotationValue(void *val) {
231 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-10~svn374877/tools/clang/include/clang/Lex/Token.h"
, 231, __PRETTY_FUNCTION__))
;
232 PtrData = val;
233 }
234
235 /// Set the specified flag.
236 void setFlag(TokenFlags Flag) {
237 Flags |= Flag;
15
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage
238 }
239
240 /// Get the specified flag.
241 bool getFlag(TokenFlags Flag) const {
242 return (Flags & Flag) != 0;
243 }
244
245 /// Unset the specified flag.
246 void clearFlag(TokenFlags Flag) {
247 Flags &= ~Flag;
248 }
249
250 /// Return the internal represtation of the flags.
251 ///
252 /// This is only intended for low-level operations such as writing tokens to
253 /// disk.
254 unsigned getFlags() const {
255 return Flags;
256 }
257
258 /// Set a flag to either true or false.
259 void setFlagValue(TokenFlags Flag, bool Val) {
260 if (Val)
261 setFlag(Flag);
262 else
263 clearFlag(Flag);
264 }
265
266 /// isAtStartOfLine - Return true if this token is at the start of a line.
267 ///
268 bool isAtStartOfLine() const { return getFlag(StartOfLine); }
269
270 /// Return true if this token has whitespace before it.
271 ///
272 bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
273
274 /// Return true if this identifier token should never
275 /// be expanded in the future, due to C99 6.10.3.4p2.
276 bool isExpandDisabled() const { return getFlag(DisableExpand); }
277
278 /// Return true if we have an ObjC keyword identifier.
279 bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
280
281 /// Return the ObjC keyword kind.
282 tok::ObjCKeywordKind getObjCKeywordID() const;
283
284 /// Return true if this token has trigraphs or escaped newlines in it.
285 bool needsCleaning() const { return getFlag(NeedsCleaning); }
286
287 /// Return true if this token has an empty macro before it.
288 ///
289 bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
290
291 /// Return true if this token is a string or character literal which
292 /// has a ud-suffix.
293 bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
294
295 /// Returns true if this token contains a universal character name.
296 bool hasUCN() const { return getFlag(HasUCN); }
297
298 /// Returns true if this token is formed by macro by stringizing or charizing
299 /// operator.
300 bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
301
302 /// Returns true if the comma after this token was elided.
303 bool commaAfterElided() const { return getFlag(CommaAfterElided); }
304
305 /// Returns true if this token is an editor placeholder.
306 ///
307 /// Editor placeholders are produced by the code-completion engine and are
308 /// represented as characters between '<#' and '#>' in the source code. The
309 /// lexer uses identifier tokens to represent placeholders.
310 bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
311};
312
313/// Information about the conditional stack (\#if directives)
314/// currently active.
315struct PPConditionalInfo {
316 /// Location where the conditional started.
317 SourceLocation IfLoc;
318
319 /// True if this was contained in a skipping directive, e.g.,
320 /// in a "\#if 0" block.
321 bool WasSkipping;
322
323 /// True if we have emitted tokens already, and now we're in
324 /// an \#else block or something. Only useful in Skipping blocks.
325 bool FoundNonSkip;
326
327 /// True if we've seen a \#else in this block. If so,
328 /// \#elif/\#else directives are not allowed.
329 bool FoundElse;
330};
331
332} // end namespace clang
333
334#endif // LLVM_CLANG_LEX_TOKEN_H