clang  3.9.0
ASTMatchers/Dynamic/Parser.cpp
Go to the documentation of this file.
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/Support/ManagedStatic.h"
21 #include <string>
22 #include <vector>
23 
24 namespace clang {
25 namespace ast_matchers {
26 namespace dynamic {
27 
28 /// \brief Simple structure to hold information for one token from the parser.
30  /// \brief Different possible tokens.
31  enum TokenKind {
42  };
43 
44  /// \brief Some known identifiers.
45  static const char* const ID_Bind;
46 
47  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
48 
49  StringRef Text;
53 };
54 
55 const char* const Parser::TokenInfo::ID_Bind = "bind";
56 
57 /// \brief Simple tokenizer for the parser.
59 public:
60  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
61  : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
62  CodeCompletionLocation(nullptr) {
63  NextToken = getNextToken();
64  }
65 
66  CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
67  unsigned CodeCompletionOffset)
68  : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
69  CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
70  NextToken = getNextToken();
71  }
72 
73  /// \brief Returns but doesn't consume the next token.
74  const TokenInfo &peekNextToken() const { return NextToken; }
75 
76  /// \brief Consumes and returns the next token.
78  TokenInfo ThisToken = NextToken;
79  NextToken = getNextToken();
80  return ThisToken;
81  }
82 
83  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
84 
85 private:
86  TokenInfo getNextToken() {
87  consumeWhitespace();
89  Result.Range.Start = currentLocation();
90 
91  if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
93  Result.Text = StringRef(CodeCompletionLocation, 0);
94  CodeCompletionLocation = nullptr;
95  return Result;
96  }
97 
98  if (Code.empty()) {
99  Result.Kind = TokenInfo::TK_Eof;
100  Result.Text = "";
101  return Result;
102  }
103 
104  switch (Code[0]) {
105  case ',':
106  Result.Kind = TokenInfo::TK_Comma;
107  Result.Text = Code.substr(0, 1);
108  Code = Code.drop_front();
109  break;
110  case '.':
111  Result.Kind = TokenInfo::TK_Period;
112  Result.Text = Code.substr(0, 1);
113  Code = Code.drop_front();
114  break;
115  case '(':
116  Result.Kind = TokenInfo::TK_OpenParen;
117  Result.Text = Code.substr(0, 1);
118  Code = Code.drop_front();
119  break;
120  case ')':
122  Result.Text = Code.substr(0, 1);
123  Code = Code.drop_front();
124  break;
125 
126  case '"':
127  case '\'':
128  // Parse a string literal.
129  consumeStringLiteral(&Result);
130  break;
131 
132  case '0': case '1': case '2': case '3': case '4':
133  case '5': case '6': case '7': case '8': case '9':
134  // Parse an unsigned literal.
135  consumeUnsignedLiteral(&Result);
136  break;
137 
138  default:
139  if (isAlphanumeric(Code[0])) {
140  // Parse an identifier
141  size_t TokenLength = 1;
142  while (1) {
143  // A code completion location in/immediately after an identifier will
144  // cause the portion of the identifier before the code completion
145  // location to become a code completion token.
146  if (CodeCompletionLocation == Code.data() + TokenLength) {
147  CodeCompletionLocation = nullptr;
149  Result.Text = Code.substr(0, TokenLength);
150  Code = Code.drop_front(TokenLength);
151  return Result;
152  }
153  if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
154  break;
155  ++TokenLength;
156  }
157  Result.Kind = TokenInfo::TK_Ident;
158  Result.Text = Code.substr(0, TokenLength);
159  Code = Code.drop_front(TokenLength);
160  } else {
162  Result.Text = Code.substr(0, 1);
163  Code = Code.drop_front(1);
164  }
165  break;
166  }
167 
168  Result.Range.End = currentLocation();
169  return Result;
170  }
171 
172  /// \brief Consume an unsigned literal.
173  void consumeUnsignedLiteral(TokenInfo *Result) {
174  unsigned Length = 1;
175  if (Code.size() > 1) {
176  // Consume the 'x' or 'b' radix modifier, if present.
177  switch (toLowercase(Code[1])) {
178  case 'x': case 'b': Length = 2;
179  }
180  }
181  while (Length < Code.size() && isHexDigit(Code[Length]))
182  ++Length;
183 
184  Result->Text = Code.substr(0, Length);
185  Code = Code.drop_front(Length);
186 
187  unsigned Value;
188  if (!Result->Text.getAsInteger(0, Value)) {
189  Result->Kind = TokenInfo::TK_Literal;
190  Result->Value = Value;
191  } else {
192  SourceRange Range;
193  Range.Start = Result->Range.Start;
194  Range.End = currentLocation();
195  Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
196  Result->Kind = TokenInfo::TK_Error;
197  }
198  }
199 
200  /// \brief Consume a string literal.
201  ///
202  /// \c Code must be positioned at the start of the literal (the opening
203  /// quote). Consumed until it finds the same closing quote character.
204  void consumeStringLiteral(TokenInfo *Result) {
205  bool InEscape = false;
206  const char Marker = Code[0];
207  for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
208  if (InEscape) {
209  InEscape = false;
210  continue;
211  }
212  if (Code[Length] == '\\') {
213  InEscape = true;
214  continue;
215  }
216  if (Code[Length] == Marker) {
217  Result->Kind = TokenInfo::TK_Literal;
218  Result->Text = Code.substr(0, Length + 1);
219  Result->Value = Code.substr(1, Length - 1);
220  Code = Code.drop_front(Length + 1);
221  return;
222  }
223  }
224 
225  StringRef ErrorText = Code;
226  Code = Code.drop_front(Code.size());
227  SourceRange Range;
228  Range.Start = Result->Range.Start;
229  Range.End = currentLocation();
230  Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
231  Result->Kind = TokenInfo::TK_Error;
232  }
233 
234  /// \brief Consume all leading whitespace from \c Code.
235  void consumeWhitespace() {
236  while (!Code.empty() && isWhitespace(Code[0])) {
237  if (Code[0] == '\n') {
238  ++Line;
239  StartOfLine = Code.drop_front();
240  }
241  Code = Code.drop_front();
242  }
243  }
244 
245  SourceLocation currentLocation() {
246  SourceLocation Location;
247  Location.Line = Line;
248  Location.Column = Code.data() - StartOfLine.data() + 1;
249  return Location;
250  }
251 
252  StringRef Code;
253  StringRef StartOfLine;
254  unsigned Line;
255  Diagnostics *Error;
256  TokenInfo NextToken;
257  const char *CodeCompletionLocation;
258 };
259 
261 
263  llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
264  return std::vector<ArgKind>();
265 }
266 
267 std::vector<MatcherCompletion>
269  return std::vector<MatcherCompletion>();
270 }
271 
274 
276  P->ContextStack.push_back(std::make_pair(C, 0u));
277  }
278 
280  P->ContextStack.pop_back();
281  }
282 
283  void nextArg() {
284  ++P->ContextStack.back().second;
285  }
286 };
287 
288 /// \brief Parse expressions that start with an identifier.
289 ///
290 /// This function can parse named values and matchers.
291 /// In case of failure it will try to determine the user's intent to give
292 /// an appropriate error message.
293 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
294  const TokenInfo NameToken = Tokenizer->consumeNextToken();
295 
296  if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
297  // Parse as a named value.
298  if (const VariantValue NamedValue =
299  NamedValues ? NamedValues->lookup(NameToken.Text)
300  : VariantValue()) {
301  *Value = NamedValue;
302  return true;
303  }
304  // If the syntax is correct and the name is not a matcher either, report
305  // unknown named value.
306  if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
307  Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
308  Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
309  !S->lookupMatcherCtor(NameToken.Text)) {
310  Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
311  << NameToken.Text;
312  return false;
313  }
314  // Otherwise, fallback to the matcher parser.
315  }
316 
317  // Parse as a matcher expression.
318  return parseMatcherExpressionImpl(NameToken, Value);
319 }
320 
321 /// \brief Parse and validate a matcher expression.
322 /// \return \c true on success, in which case \c Value has the matcher parsed.
323 /// If the input is malformed, or some argument has an error, it
324 /// returns \c false.
325 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
326  VariantValue *Value) {
327  assert(NameToken.Kind == TokenInfo::TK_Ident);
328  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
329  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
330  Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
331  << OpenToken.Text;
332  return false;
333  }
334 
335  llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
336 
337  if (!Ctor) {
338  Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
339  << NameToken.Text;
340  // Do not return here. We need to continue to give completion suggestions.
341  }
342 
343  std::vector<ParserValue> Args;
344  TokenInfo EndToken;
345 
346  {
347  ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
348 
349  while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
350  if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
351  // End of args.
352  EndToken = Tokenizer->consumeNextToken();
353  break;
354  }
355  if (Args.size() > 0) {
356  // We must find a , token to continue.
357  const TokenInfo CommaToken = Tokenizer->consumeNextToken();
358  if (CommaToken.Kind != TokenInfo::TK_Comma) {
359  Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
360  << CommaToken.Text;
361  return false;
362  }
363  }
364 
366  NameToken.Text, NameToken.Range,
367  Args.size() + 1);
368  ParserValue ArgValue;
369  ArgValue.Text = Tokenizer->peekNextToken().Text;
370  ArgValue.Range = Tokenizer->peekNextToken().Range;
371  if (!parseExpressionImpl(&ArgValue.Value)) {
372  return false;
373  }
374 
375  Args.push_back(ArgValue);
376  SCE.nextArg();
377  }
378  }
379 
380  if (EndToken.Kind == TokenInfo::TK_Eof) {
381  Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
382  return false;
383  }
384 
385  std::string BindID;
386  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
387  // Parse .bind("foo")
388  Tokenizer->consumeNextToken(); // consume the period.
389  const TokenInfo BindToken = Tokenizer->consumeNextToken();
390  if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
391  addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
392  return false;
393  }
394 
395  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
396  const TokenInfo IDToken = Tokenizer->consumeNextToken();
397  const TokenInfo CloseToken = Tokenizer->consumeNextToken();
398 
399  // TODO: We could use different error codes for each/some to be more
400  // explicit about the syntax error.
401  if (BindToken.Kind != TokenInfo::TK_Ident ||
402  BindToken.Text != TokenInfo::ID_Bind) {
403  Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
404  return false;
405  }
406  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
407  Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
408  return false;
409  }
410  if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
411  Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
412  return false;
413  }
414  if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
415  Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
416  return false;
417  }
418  BindID = IDToken.Value.getString();
419  }
420 
421  if (!Ctor)
422  return false;
423 
424  // Merge the start and end infos.
426  NameToken.Text, NameToken.Range);
427  SourceRange MatcherRange = NameToken.Range;
428  MatcherRange.End = EndToken.Range.End;
429  VariantMatcher Result = S->actOnMatcherExpression(
430  *Ctor, MatcherRange, BindID, Args, Error);
431  if (Result.isNull()) return false;
432 
433  *Value = Result;
434  return true;
435 }
436 
437 // If the prefix of this completion matches the completion token, add it to
438 // Completions minus the prefix.
439 void Parser::addCompletion(const TokenInfo &CompToken,
440  const MatcherCompletion& Completion) {
441  if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
442  Completion.Specificity > 0) {
443  Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
444  Completion.MatcherDecl, Completion.Specificity);
445  }
446 }
447 
448 std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
449  ArrayRef<ArgKind> AcceptedTypes) {
450  if (!NamedValues) return std::vector<MatcherCompletion>();
451  std::vector<MatcherCompletion> Result;
452  for (const auto &Entry : *NamedValues) {
453  unsigned Specificity;
454  if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
455  std::string Decl =
456  (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
457  Result.emplace_back(Entry.getKey(), Decl, Specificity);
458  }
459  }
460  return Result;
461 }
462 
463 void Parser::addExpressionCompletions() {
464  const TokenInfo CompToken = Tokenizer->consumeNextToken();
465  assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
466 
467  // We cannot complete code if there is an invalid element on the context
468  // stack.
469  for (ContextStackTy::iterator I = ContextStack.begin(),
470  E = ContextStack.end();
471  I != E; ++I) {
472  if (!I->first)
473  return;
474  }
475 
476  auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
477  for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
478  addCompletion(CompToken, Completion);
479  }
480 
481  for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
482  addCompletion(CompToken, Completion);
483  }
484 }
485 
486 /// \brief Parse an <Expresssion>
487 bool Parser::parseExpressionImpl(VariantValue *Value) {
488  switch (Tokenizer->nextTokenKind()) {
490  *Value = Tokenizer->consumeNextToken().Value;
491  return true;
492 
493  case TokenInfo::TK_Ident:
494  return parseIdentifierPrefixImpl(Value);
495 
497  addExpressionCompletions();
498  return false;
499 
500  case TokenInfo::TK_Eof:
501  Error->addError(Tokenizer->consumeNextToken().Range,
502  Error->ET_ParserNoCode);
503  return false;
504 
505  case TokenInfo::TK_Error:
506  // This error was already reported by the tokenizer.
507  return false;
508 
511  case TokenInfo::TK_Comma:
514  const TokenInfo Token = Tokenizer->consumeNextToken();
515  Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
516  return false;
517  }
518 
519  llvm_unreachable("Unknown token kind.");
520 }
521 
522 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
523 
524 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
525  const NamedValueMap *NamedValues, Diagnostics *Error)
526  : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
527  NamedValues(NamedValues), Error(Error) {}
528 
529 Parser::RegistrySema::~RegistrySema() {}
530 
532 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
533  return Registry::lookupMatcherCtor(MatcherName);
534 }
535 
536 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
537  MatcherCtor Ctor, SourceRange NameRange, StringRef BindID,
538  ArrayRef<ParserValue> Args, Diagnostics *Error) {
539  if (BindID.empty()) {
540  return Registry::constructMatcher(Ctor, NameRange, Args, Error);
541  } else {
542  return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
543  Error);
544  }
545 }
546 
547 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
548  ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
549  return Registry::getAcceptedCompletionTypes(Context);
550 }
551 
552 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
553  ArrayRef<ArgKind> AcceptedTypes) {
554  return Registry::getMatcherCompletions(AcceptedTypes);
555 }
556 
557 bool Parser::parseExpression(StringRef Code, Sema *S,
558  const NamedValueMap *NamedValues,
559  VariantValue *Value, Diagnostics *Error) {
560  CodeTokenizer Tokenizer(Code, Error);
561  if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
562  return false;
563  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
564  Error->addError(Tokenizer.peekNextToken().Range,
565  Error->ET_ParserTrailingCode);
566  return false;
567  }
568  return true;
569 }
570 
571 std::vector<MatcherCompletion>
572 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
573  const NamedValueMap *NamedValues) {
574  Diagnostics Error;
575  CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
576  Parser P(&Tokenizer, S, NamedValues, &Error);
577  VariantValue Dummy;
578  P.parseExpressionImpl(&Dummy);
579 
580  // Sort by specificity, then by name.
581  std::sort(P.Completions.begin(), P.Completions.end(),
582  [](const MatcherCompletion &A, const MatcherCompletion &B) {
583  if (A.Specificity != B.Specificity)
584  return A.Specificity > B.Specificity;
585  return A.TypedText < B.TypedText;
586  });
587 
588  return P.Completions;
589 }
590 
592 Parser::parseMatcherExpression(StringRef Code, Sema *S,
593  const NamedValueMap *NamedValues,
594  Diagnostics *Error) {
596  if (!parseExpression(Code, S, NamedValues, &Value, Error))
598  if (!Value.isMatcher()) {
599  Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
601  }
603  Value.getMatcher().getSingleMatcher();
604  if (!Result.hasValue()) {
606  << Value.getTypeAsString();
607  }
608  return Result;
609 }
610 
611 } // namespace dynamic
612 } // namespace ast_matchers
613 } // namespace clang
Simple matcher expression parser.
bool isMatcher() const
Matcher value functions.
unsigned Length
static LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Definition: CharInfo.h:88
std::string getTypeAsString() const
String representation of the type of the value.
Registry of all known matchers.
CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
CodeTokenizer(StringRef MatcherCode, Diagnostics *Error, unsigned CodeCompletionOffset)
StringRef P
virtual llvm::Optional< MatcherCtor > lookupMatcherCtor(StringRef MatcherName)=0
Look up a matcher by name.
static const char *const ID_Bind
Some known identifiers.
detail::InMemoryDirectory::const_iterator I
Sema - This implements semantic analysis and AST building for C.
Definition: Sema.h:263
ASTContext * Context
virtual VariantMatcher actOnMatcherExpression(MatcherCtor Ctor, SourceRange NameRange, StringRef BindID, ArrayRef< ParserValue > Args, Diagnostics *Error)=0
Process a matcher expression.
FormatToken * Token
The result type of a method or function.
const TokenInfo & peekNextToken() const
Returns but doesn't consume the next token.
Helper class to manage error messages.
Definition: Diagnostics.h:51
ArgStream addError(SourceRange Range, ErrorType Error)
Add an error to the diagnostics.
Definition: Diagnostics.cpp:66
Kind
const TemplateArgument * iterator
Definition: Type.h:4233
static LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:118
unsigned Specificity
Value corresponding to the "specificity" of the converted matcher.
Definition: Registry.h:55
std::string TypedText
The text to type to select this matcher.
Definition: Registry.h:45
static llvm::ManagedStatic< Parser::RegistrySema > DefaultRegistrySema
Interface to connect the parser with the registry and more.
Simple structure to hold information for one token from the parser.
detail::InMemoryDirectory::const_iterator E
virtual std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned >> Context)
Compute the list of completion types for Context.
static LLVM_READONLY char toLowercase(char c)
Converts the given ASCII character to its lowercase equivalent.
Definition: CharInfo.h:165
virtual std::vector< MatcherCompletion > getMatcherCompletions(llvm::ArrayRef< ArgKind > AcceptedTypes)
Compute the list of completions that match any of AcceptedTypes.
TokenInfo consumeNextToken()
Consumes and returns the next token.
const VariantMatcher & getMatcher() const
llvm::StringMap< VariantValue > NamedValueMap
llvm::Optional< DynTypedMatcher > getSingleMatcher() const
Return a single matcher, if there is no ambiguity.
static LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:124