#!/usr/bin/env python # # Copyright 2007 Neal Norwitz # Portions Copyright 2007 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Generate an Abstract Syntax Tree (AST) for C++.""" # FIXME: # * Tokens should never be exported, need to convert to Nodes # (return types, parameters, etc.) # * Handle static class data for templatized classes # * Handle casts (both C++ and C-style) # * Handle conditions and loops (if/else, switch, for, while/do) # # TODO much, much later: # * Handle #define # * exceptions try: # Python 3.x import builtins except ImportError: # Python 2.x import __builtin__ as builtins import collections import sys import traceback from cpp import keywords from cpp import tokenize from cpp import utils if not hasattr(builtins, 'reversed'): # Support Python 2.3 and earlier. def reversed(seq): for i in range(len(seq)-1, -1, -1): yield seq[i] if not hasattr(builtins, 'next'): # Support Python 2.5 and earlier. def next(obj): return obj.next() VISIBILITY_PUBLIC, VISIBILITY_PROTECTED, VISIBILITY_PRIVATE = range(3) FUNCTION_NONE = 0x00 FUNCTION_CONST = 0x01 FUNCTION_VIRTUAL = 0x02 FUNCTION_PURE_VIRTUAL = 0x04 FUNCTION_CTOR = 0x08 FUNCTION_DTOR = 0x10 FUNCTION_ATTRIBUTE = 0x20 FUNCTION_UNKNOWN_ANNOTATION = 0x40 FUNCTION_THROW = 0x80 FUNCTION_OVERRIDE = 0x100 """ These are currently unused. Should really handle these properly at some point. TYPE_MODIFIER_INLINE = 0x010000 TYPE_MODIFIER_EXTERN = 0x020000 TYPE_MODIFIER_STATIC = 0x040000 TYPE_MODIFIER_CONST = 0x080000 TYPE_MODIFIER_REGISTER = 0x100000 TYPE_MODIFIER_VOLATILE = 0x200000 TYPE_MODIFIER_MUTABLE = 0x400000 TYPE_MODIFIER_MAP = { 'inline': TYPE_MODIFIER_INLINE, 'extern': TYPE_MODIFIER_EXTERN, 'static': TYPE_MODIFIER_STATIC, 'const': TYPE_MODIFIER_CONST, 'register': TYPE_MODIFIER_REGISTER, 'volatile': TYPE_MODIFIER_VOLATILE, 'mutable': TYPE_MODIFIER_MUTABLE, } """ _INTERNAL_TOKEN = 'internal' _NAMESPACE_POP = 'ns-pop' # TODO(nnorwitz): use this as a singleton for templated_types, etc # where we don't want to create a new empty dict each time. It is also const. class _NullDict(object): __contains__ = lambda self: False keys = values = items = iterkeys = itervalues = iteritems = lambda self: () # TODO(nnorwitz): move AST nodes into a separate module. class Node(object): """Base AST node.""" def __init__(self, start, end): self.start = start self.end = end def IsDeclaration(self): """Returns bool if this node is a declaration.""" return False def IsDefinition(self): """Returns bool if this node is a definition.""" return False def IsExportable(self): """Returns bool if this node exportable from a header file.""" return False def Requires(self, node): """Does this AST node require the definition of the node passed in?""" return False def XXX__str__(self): return self._StringHelper(self.__class__.__name__, '') def _StringHelper(self, name, suffix): if not utils.DEBUG: return '%s(%s)' % (name, suffix) return '%s(%d, %d, %s)' % (name, self.start, self.end, suffix) def __repr__(self): return str(self) class Define(Node): def __init__(self, start, end, name, definition): Node.__init__(self, start, end) self.name = name self.definition = definition def __str__(self): value = '%s %s' % (self.name, self.definition) return self._StringHelper(self.__class__.__name__, value) class Include(Node): def __init__(self, start, end, filename, system): Node.__init__(self, start, end) self.filename = filename self.system = system def __str__(self): fmt = '"%s"' if self.system: fmt = '<%s>' return self._StringHelper(self.__class__.__name__, fmt % self.filename) class Goto(Node): def __init__(self, start, end, label): Node.__init__(self, start, end) self.label = label def __str__(self): return self._StringHelper(self.__class__.__name__, str(self.label)) class Expr(Node): def __init__(self, start, end, expr): Node.__init__(self, start, end) self.expr = expr def Requires(self, node): # TODO(nnorwitz): impl. return False def __str__(self): return self._StringHelper(self.__class__.__name__, str(self.expr)) class Return(Expr): pass class Delete(Expr): pass class Friend(Expr): def __init__(self, start, end, expr, namespace): Expr.__init__(self, start, end, expr) self.namespace = namespace[:] class Using(Node): def __init__(self, start, end, names): Node.__init__(self, start, end) self.names = names def __str__(self): return self._StringHelper(self.__class__.__name__, str(self.names)) class Parameter(Node): def __init__(self, start, end, name, parameter_type, default): Node.__init__(self, start, end) self.name = name self.type = parameter_type self.default = default def Requires(self, node): # TODO(nnorwitz): handle namespaces, etc. return self.type.name == node.name def __str__(self): name = str(self.type) suffix = '%s %s' % (name, self.name) if self.default: suffix += ' = ' + ''.join([d.name for d in self.default]) return self._StringHelper(self.__class__.__name__, suffix) class _GenericDeclaration(Node): def __init__(self, start, end, name, namespace): Node.__init__(self, start, end) self.name = name self.namespace = namespace[:] def FullName(self): prefix = '' if self.namespace and self.namespace[-1]: prefix = '::'.join(self.namespace) + '::' return prefix + self.name def _TypeStringHelper(self, suffix): if self.namespace: names = [n or '' for n in self.namespace] suffix += ' in ' + '::'.join(names) return self._StringHelper(self.__class__.__name__, suffix) # TODO(nnorwitz): merge with Parameter in some way? class VariableDeclaration(_GenericDeclaration): def __init__(self, start, end, name, var_type, initial_value, namespace): _GenericDeclaration.__init__(self, start, end, name, namespace) self.type = var_type self.initial_value = initial_value def Requires(self, node): # TODO(nnorwitz): handle namespaces, etc. return self.type.name == node.name def ToString(self): """Return a string that tries to reconstitute the variable decl.""" suffix = '%s %s' % (self.type, self.name) if self.initial_value: suffix += ' = ' + self.initial_value return suffix def __str__(self): return self._StringHelper(self.__class__.__name__, self.ToString()) class Typedef(_GenericDeclaration): def __init__(self, start, end, name, alias, namespace): _GenericDeclaration.__init__(self, start, end, name, namespace) self.alias = alias def IsDefinition(self): return True def IsExportable(self): return True def Requires(self, node): # TODO(nnorwitz): handle namespaces, etc. name = node.name for token in self.alias: if token is not None and name == token.name: return True return False def __str__(self): suffix = '%s, %s' % (self.name, self.alias) return self._TypeStringHelper(suffix) class _NestedType(_GenericDeclaration): def __init__(self, start, end, name, fields, namespace): _GenericDeclaration.__init__(self, start, end, name, namespace) self.fields = fields def IsDefinition(self): return True def IsExportable(self): return True def __str__(self): suffix = '%s, {%s}' % (self.name, self.fields) return self._TypeStringHelper(suffix) class Union(_NestedType): pass class Enum(_NestedType): pass class Class(_GenericDeclaration): def __init__(self, start, end, name, bases, templated_types, body, namespace): _GenericDeclaration.__init__(self, start, end, name, namespace) self.bases = bases self.body = body self.templated_types = templated_types def IsDeclaration(self): return self.bases is None and self.body is None def IsDefinition(self): return not self.IsDeclaration() def IsExportable(self): return not self.IsDeclaration() def Requires(self, node): # TODO(nnorwitz): handle namespaces, etc. if self.bases: for token_list in self.bases: # TODO(nnorwitz): bases are tokens, do name comparision. for token in token_list: if token.name == node.name: return True # TODO(nnorwitz): search in body too. return False def __str__(self): name = self.name if self.templated_types: name += '<%s>' % self.templated_types suffix = '%s, %s, %s' % (name, self.bases, self.body) return self._TypeStringHelper(suffix) class Struct(Class): pass class Function(_GenericDeclaration): def __init__(self, start, end, name, return_type, parameters, modifiers, templated_types, body, namespace): _GenericDeclaration.__init__(self, start, end, name, namespace) converter = TypeConverter(namespace) self.return_type = converter.CreateReturnType(return_type) self.parameters = converter.ToParameters(parameters) self.modifiers = modifiers self.body = body self.templated_types = templated_types def IsDeclaration(self): return self.body is None def IsDefinition(self): return self.body is not None def IsExportable(self): if self.return_type and 'static' in self.return_type.modifiers: return False return None not in self.namespace def Requires(self, node): if self.parameters: # TODO(nnorwitz): parameters are tokens, do name comparision. for p in self.parameters: if p.name == node.name: return True # TODO(nnorwitz): search in body too. return False def __str__(self): # TODO(nnorwitz): add templated_types. suffix = ('%s %s(%s), 0x%02x, %s' % (self.return_type, self.name, self.parameters, self.modifiers, self.body)) return self._TypeStringHelper(suffix) class Method(Function): def __init__(self, start, end, name, in_class, return_type, parameters, modifiers, templated_types, body, namespace): Function.__init__(self, start, end, name, return_type, parameters, modifiers, templated_types, body, namespace) # TODO(nnorwitz): in_class could also be a namespace which can # mess up finding functions properly. self.in_class = in_class class Type(_GenericDeclaration): """Type used for any variable (eg class, primitive, struct, etc).""" def __init__(self, start, end, name, templated_types, modifiers, reference, pointer, array): """ Args: name: str name of main type templated_types: [Class (Type?)] template type info between <> modifiers: [str] type modifiers (keywords) eg, const, mutable, etc. reference, pointer, array: bools """ _GenericDeclaration.__init__(self, start, end, name, []) self.templated_types = templated_types if not name and modifiers: self.name = modifiers.pop() self.modifiers = modifiers self.reference = reference self.pointer = pointer self.array = array def __str__(self): prefix = '' if self.modifiers: prefix = ' '.join(self.modifiers) + ' ' name = str(self.name) if self.templated_types: name += '<%s>' % self.templated_types suffix = prefix + name if self.reference: suffix += '&' if self.pointer: suffix += '*' if self.array: suffix += '[]' return self._TypeStringHelper(suffix) # By definition, Is* are always False. A Type can only exist in # some sort of variable declaration, parameter, or return value. def IsDeclaration(self): return False def IsDefinition(self): return False def IsExportable(self): return False class TypeConverter(object): def __init__(self, namespace_stack): self.namespace_stack = namespace_stack def _GetTemplateEnd(self, tokens, start): count = 1 end = start while 1: token = tokens[end] end += 1 if token.name == '<': count += 1 elif token.name == '>': count -= 1 if count == 0: break return tokens[start:end-1], end def ToType(self, tokens): """Convert [Token,...] to [Class(...), ] useful for base classes. For example, code like class Foo : public Bar { ... }; the "Bar" portion gets converted to an AST. Returns: [Class(...), ...] """ result = [] name_tokens = [] reference = pointer = array = False def AddType(templated_types): # Partition tokens into name and modifier tokens. names = [] modifiers = [] for t in name_tokens: if keywords.IsKeyword(t.name): modifiers.append(t.name) else: names.append(t.name) name = ''.join(names) if name_tokens: result.append(Type(name_tokens[0].start, name_tokens[-1].end, name, templated_types, modifiers, reference, pointer, array)) del name_tokens[:] i = 0 end = len(tokens) while i < end: token = tokens[i] if token.name == '<': new_tokens, new_end = self._GetTemplateEnd(tokens, i+1) AddType(self.ToType(new_tokens)) # If there is a comma after the template, we need to consume # that here otherwise it becomes part of the name. i = new_end reference = pointer = array = False elif token.name == ',': AddType([]) reference = pointer = array = False elif token.name == '*': pointer = True elif token.name == '&': reference = True elif token.name == '[': pointer = True elif token.name == ']': pass else: name_tokens.append(token) i += 1 if name_tokens: # No '<' in the tokens, just a simple name and no template. AddType([]) return result def DeclarationToParts(self, parts, needs_name_removed): name = None default = [] if needs_name_removed: # Handle default (initial) values properly. for i, t in enumerate(parts): if t.name == '=': default = parts[i+1:] name = parts[i-1].name if name == ']' and parts[i-2].name == '[': name = parts[i-3].name i -= 1 parts = parts[:i-1] break else: if parts[-1].token_type == tokenize.NAME: name = parts.pop().name else: # TODO(nnorwitz): this is a hack that happens for code like # Register(Foo); where it thinks this is a function call # but it's actually a declaration. name = '???' modifiers = [] type_name = [] other_tokens = [] templated_types = [] i = 0 end = len(parts) while i < end: p = parts[i] if keywords.IsKeyword(p.name): modifiers.append(p.name) elif p.name == '<': templated_tokens, new_end = self._GetTemplateEnd(parts, i+1) templated_types = self.ToType(templated_tokens) i = new_end - 1 # Don't add a spurious :: to data members being initialized. next_index = i + 1 if next_index < end and parts[next_index].name == '::': i += 1 elif p.name in ('[', ']', '='): # These are handled elsewhere. other_tokens.append(p) elif p.name not in ('*', '&', '>'): # Ensure that names have a space between them. if (type_name and type_name[-1].token_type == tokenize.NAME and p.token_type == tokenize.NAME): type_name.append(tokenize.Token(tokenize.SYNTAX, ' ', 0, 0)) type_name.append(p) else: other_tokens.append(p) i += 1 type_name = ''.join([t.name for t in type_name]) return name, type_name, templated_types, modifiers, default, other_tokens def ToParameters(self, tokens): if not tokens: return [] result = [] name = type_name = '' type_modifiers = [] pointer = reference = array = False first_token = None default = [] def AddParameter(end): if default: del default[0] # Remove flag. parts = self.DeclarationToParts(type_modifiers, True) (name, type_name, templated_types, modifiers, unused_default, unused_other_tokens) = parts parameter_type = Type(first_token.start, first_token.end, type_name, templated_types, modifiers, reference, pointer, array) p = Parameter(first_token.start, end, name, parameter_type, default) result.append(p) template_count = 0 brace_count = 0 for s in tokens: if not first_token: first_token = s # Check for braces before templates, as we can have unmatched '<>' # inside default arguments. if s.name == '{': brace_count += 1 elif s.name == '}': brace_count -= 1 if brace_count > 0: type_modifiers.append(s) continue if s.name == '<': template_count += 1 elif s.name == '>': template_count -= 1 if template_count > 0: type_modifiers.append(s) continue if s.name == ',': AddParameter(s.start) name = type_name = '' type_modifiers = [] pointer = reference = array = False first_token = None default = [] elif s.name == '*': pointer = True elif s.name == '&': reference = True elif s.name == '[': array = True elif s.name == ']': pass # Just don't add to type_modifiers. elif s.name == '=': # Got a default value. Add any value (None) as a flag. default.append(None) elif default: default.append(s) else: type_modifiers.append(s) AddParameter(tokens[-1].end) return result def CreateReturnType(self, return_type_seq): if not return_type_seq: return None start = return_type_seq[0].start end = return_type_seq[-1].end _, name, templated_types, modifiers, default, other_tokens = \ self.DeclarationToParts(return_type_seq, False) names = [n.name for n in other_tokens] reference = '&' in names pointer = '*' in names array = '[' in names return Type(start, end, name, templated_types, modifiers, reference, pointer, array) def GetTemplateIndices(self, names): # names is a list of strings. start = names.index('<') end = len(names) - 1 while end > 0: if names[end] == '>': break end -= 1 return start, end+1 class AstBuilder(object): def __init__(self, token_stream, filename, in_class='', visibility=None, namespace_stack=[]): self.tokens = token_stream self.filename = filename # TODO(nnorwitz): use a better data structure (deque) for the queue. # Switching directions of the "queue" improved perf by about 25%. # Using a deque should be even better since we access from both sides. self.token_queue = [] self.namespace_stack = namespace_stack[:] self.in_class = in_class if in_class is None: self.in_class_name_only = None else: self.in_class_name_only = in_class.split('::')[-1] self.visibility = visibility self.in_function = False self.current_token = None # Keep the state whether we are currently handling a typedef or not. self._handling_typedef = False self.converter = TypeConverter(self.namespace_stack) def HandleError(self, msg, token): printable_queue = list(reversed(self.token_queue[-20:])) sys.stderr.write('Got %s in %s @ %s %s\n' % (msg, self.filename, token, printable_queue)) def Generate(self): while 1: token = self._GetNextToken() if not token: break # Get the next token. self.current_token = token # Dispatch on the next token type. if token.token_type == _INTERNAL_TOKEN: if token.name == _NAMESPACE_POP: self.namespace_stack.pop() continue try: result = self._GenerateOne(token) if result is not None: yield result except: self.HandleError('exception', token) raise def _CreateVariable(self, pos_token, name, type_name, type_modifiers, ref_pointer_name_seq, templated_types, value=None): reference = '&' in ref_pointer_name_seq pointer = '*' in ref_pointer_name_seq array = '[' in ref_pointer_name_seq var_type = Type(pos_token.start, pos_token.end, type_name, templated_types, type_modifiers, reference, pointer, array) return VariableDeclaration(pos_token.start, pos_token.end, name, var_type, value, self.namespace_stack) def _GenerateOne(self, token): if token.token_type == tokenize.NAME: if (keywords.IsKeyword(token.name) and not keywords.IsBuiltinType(token.name)): if token.name == 'enum': # Pop the next token and only put it back if it's not # 'class'. This allows us to support the two-token # 'enum class' keyword as if it were simply 'enum'. next = self._GetNextToken() if next.name != 'class': self._AddBackToken(next) method = getattr(self, 'handle_' + token.name) return method() elif token.name == self.in_class_name_only: # The token name is the same as the class, must be a ctor if # there is a paren. Otherwise, it's the return type. # Peek ahead to get the next token to figure out which. next = self._GetNextToken() self._AddBackToken(next) if next.token_type == tokenize.SYNTAX and next.name == '(': return self._GetMethod([token], FUNCTION_CTOR, None, True) # Fall through--handle like any other method. # Handle data or function declaration/definition. syntax = tokenize.SYNTAX temp_tokens, last_token = \ self._GetVarTokensUpToIgnoringTemplates(syntax, '(', ';', '{', '[') temp_tokens.insert(0, token) if last_token.name == '(': # If there is an assignment before the paren, # this is an expression, not a method. expr = bool([e for e in temp_tokens if e.name == '=']) if expr: new_temp = self._GetTokensUpTo(tokenize.SYNTAX, ';') temp_tokens.append(last_token) temp_tokens.extend(new_temp) last_token = tokenize.Token(tokenize.SYNTAX, ';', 0, 0) if last_token.name == '[': # Handle array, this isn't a method, unless it's an operator. # TODO(nnorwitz): keep the size somewhere. # unused_size = self._GetTokensUpTo(tokenize.SYNTAX, ']') temp_tokens.append(last_token) if temp_tokens[-2].name == 'operator': temp_tokens.append(self._GetNextToken()) else: temp_tokens2, last_token = \ self._GetVarTokensUpTo(tokenize.SYNTAX, ';') temp_tokens.extend(temp_tokens2) if last_token.name == ';': # Handle data, this isn't a method. parts = self.converter.DeclarationToParts(temp_tokens, True) (name, type_name, templated_types, modifiers, default, unused_other_tokens) = parts t0 = temp_tokens[0] names = [t.name for t in temp_tokens] if templated_types: start, end = self.converter.GetTemplateIndices(names) names = names[:start] + names[end:] default = ''.join([t.name for t in default]) return self._CreateVariable(t0, name, type_name, modifiers, names, templated_types, default) if last_token.name == '{': self._AddBackTokens(temp_tokens[1:]) self._AddBackToken(last_token) method_name = temp_tokens[0].name method = getattr(self, 'handle_' + method_name, None) if not method: # Must be declaring a variable. # TODO(nnorwitz): handle the declaration. return None return method() return self._GetMethod(temp_tokens, 0, None, False) elif token.token_type == tokenize.SYNTAX: if token.name == '~' and self.in_class: # Must be a dtor (probably not in method body). token = self._GetNextToken() # self.in_class can contain A::Name, but the dtor will only # be Name. Make sure to compare against the right value. if (token.token_type == tokenize.NAME and token.name == self.in_class_name_only): return self._GetMethod([token], FUNCTION_DTOR, None, True) # TODO(nnorwitz): handle a lot more syntax. elif token.token_type == tokenize.PREPROCESSOR: # TODO(nnorwitz): handle more preprocessor directives. # token starts with a #, so remove it and strip whitespace. name = token.name[1:].lstrip() if name.startswith('include'): # Remove "include". name = name[7:].strip() assert name # Handle #include \ "header-on-second-line.h". if name.startswith('\\'): name = name[1:].strip() assert name[0] in '<"', token assert name[-1] in '>"', token system = name[0] == '<' filename = name[1:-1] return Include(token.start, token.end, filename, system) if name.startswith('define'): # Remove "define". name = name[6:].strip() assert name value = '' for i, c in enumerate(name): if c.isspace(): value = name[i:].lstrip() name = name[:i] break return Define(token.start, token.end, name, value) if name.startswith('if') and name[2:3].isspace(): condition = name[3:].strip() if condition.startswith('0') or condition.startswith('(0)'): self._SkipIf0Blocks() return None def _GetTokensUpTo(self, expected_token_type, expected_token): return self._GetVarTokensUpTo(expected_token_type, expected_token)[0] def _GetVarTokensUpTo(self, expected_token_type, *expected_tokens): last_token = self._GetNextToken() tokens = [] while (last_token.token_type != expected_token_type or last_token.name not in expected_tokens): tokens.append(last_token) last_token = self._GetNextToken() return tokens, last_token # Same as _GetVarTokensUpTo, but skips over '<...>' which could contain an # expected token. def _GetVarTokensUpToIgnoringTemplates(self, expected_token_type, *expected_tokens): last_token = self._GetNextToken() tokens = [] nesting = 0 while (nesting > 0 or last_token.token_type != expected_token_type or last_token.name not in expected_tokens): tokens.append(last_token) last_token = self._GetNextToken() if last_token.name == '<': nesting += 1 elif last_token.name == '>': nesting -= 1 return tokens, last_token # TODO(nnorwitz): remove _IgnoreUpTo() it shouldn't be necesary. def _IgnoreUpTo(self, token_type, token): unused_tokens = self._GetTokensUpTo(token_type, token) def _SkipIf0Blocks(self): count = 1 while 1: token = self._GetNextToken() if token.token_type != tokenize.PREPROCESSOR: continue name = token.name[1:].lstrip() if name.startswith('endif'): count -= 1 if count == 0: break elif name.startswith('if'): count += 1 def _GetMatchingChar(self, open_paren, close_paren, GetNextToken=None): if GetNextToken is None: GetNextToken = self._GetNextToken # Assumes the current token is open_paren and we will consume # and return up to the close_paren. count = 1 token = GetNextToken() while 1: if token.token_type == tokenize.SYNTAX: if token.name == open_paren: count += 1 elif token.name == close_paren: count -= 1 if count == 0: break yield token token = GetNextToken() yield token def _GetParameters(self): return self._GetMatchingChar('(', ')') def GetScope(self): return self._GetMatchingChar('{', '}') def _GetNextToken(self): if self.token_queue: return self.token_queue.pop() try: return next(self.tokens) except StopIteration: return def _AddBackToken(self, token): if token.whence == tokenize.WHENCE_STREAM: token.whence = tokenize.WHENCE_QUEUE self.token_queue.insert(0, token) else: assert token.whence == tokenize.WHENCE_QUEUE, token self.token_queue.append(token) def _AddBackTokens(self, tokens): if tokens: if tokens[-1].whence == tokenize.WHENCE_STREAM: for token in tokens: token.whence = tokenize.WHENCE_QUEUE self.token_queue[:0] = reversed(tokens) else: assert tokens[-1].whence == tokenize.WHENCE_QUEUE, tokens self.token_queue.extend(reversed(tokens)) def GetName(self, seq=None): """Returns ([tokens], next_token_info).""" GetNextToken = self._GetNextToken if seq is not None: it = iter(seq) GetNextToken = lambda: next(it) next_token = GetNextToken() tokens = [] last_token_was_name = False while (next_token.token_type == tokenize.NAME or (next_token.token_type == tokenize.SYNTAX and next_token.name in ('::', '<'))): # Two NAMEs in a row means the identifier should terminate. # It's probably some sort of variable declaration. if last_token_was_name and next_token.token_type == tokenize.NAME: break last_token_was_name = next_token.token_type == tokenize.NAME tokens.append(next_token) # Handle templated names. if next_token.name == '<': tokens.extend(self._GetMatchingChar('<', '>', GetNextToken)) last_token_was_name = True next_token = GetNextToken() return tokens, next_token def GetMethod(self, modifiers, templated_types): return_type_and_name = self._GetTokensUpTo(tokenize.SYNTAX, '(') assert len(return_type_and_name) >= 1 return self._GetMethod(return_type_and_name, modifiers, templated_types, False) def _GetMethod(self, return_type_and_name, modifiers, templated_types, get_paren): template_portion = None if get_paren: token = self._GetNextToken() assert token.token_type == tokenize.SYNTAX, token if token.name == '<': # Handle templatized dtors. template_portion = [token] template_portion.extend(self._GetMatchingChar('<', '>')) token = self._GetNextToken() assert token.token_type == tokenize.SYNTAX, token assert token.name == '(', token name = return_type_and_name.pop() # Handle templatized ctors. if name.name == '>': index = 1 while return_type_and_name[index].name != '<': index += 1 template_portion = return_type_and_name[index:] + [name] del return_type_and_name[index:] name = return_type_and_name.pop() elif name.name == ']': rt = return_type_and_name assert rt[-1].name == '[', return_type_and_name assert rt[-2].name == 'operator', return_type_and_name name_seq = return_type_and_name[-2:] del return_type_and_name[-2:] name = tokenize.Token(tokenize.NAME, 'operator[]', name_seq[0].start, name.end) # Get the open paren so _GetParameters() below works. unused_open_paren = self._GetNextToken() # TODO(nnorwitz): store template_portion. return_type = return_type_and_name indices = name if return_type: indices = return_type[0] # Force ctor for templatized ctors. if name.name == self.in_class and not modifiers: modifiers |= FUNCTION_CTOR parameters = list(self._GetParameters()) del parameters[-1] # Remove trailing ')'. # Handling operator() is especially weird. if name.name == 'operator' and not parameters: token = self._GetNextToken() assert token.name == '(', token parameters = list(self._GetParameters()) del parameters[-1] # Remove trailing ')'. token = self._GetNextToken() while token.token_type == tokenize.NAME: modifier_token = token token = self._GetNextToken() if modifier_token.name == 'const': modifiers |= FUNCTION_CONST elif modifier_token.name == '__attribute__': # TODO(nnorwitz): handle more __attribute__ details. modifiers |= FUNCTION_ATTRIBUTE assert token.name == '(', token # Consume everything between the (parens). unused_tokens = list(self._GetMatchingChar('(', ')')) token = self._GetNextToken() elif modifier_token.name == 'throw': modifiers |= FUNCTION_THROW assert token.name == '(', token # Consume everything between the (parens). unused_tokens = list(self._GetMatchingChar('(', ')')) token = self._GetNextToken() elif modifier_token.name == 'override': modifiers |= FUNCTION_OVERRIDE elif modifier_token.name == modifier_token.name.upper(): # HACK(nnorwitz): assume that all upper-case names # are some macro we aren't expanding. modifiers |= FUNCTION_UNKNOWN_ANNOTATION else: self.HandleError('unexpected token', modifier_token) assert token.token_type == tokenize.SYNTAX, token # Handle ctor initializers. if token.name == ':': # TODO(nnorwitz): anything else to handle for initializer list? while token.name != ';' and token.name != '{': token = self._GetNextToken() # Handle pointer to functions that are really data but look # like method declarations. if token.name == '(': if parameters[0].name == '*': # name contains the return type. name = parameters.pop() # parameters contains the name of the data. modifiers = [p.name for p in parameters] # Already at the ( to open the parameter list. function_parameters = list(self._GetMatchingChar('(', ')')) del function_parameters[-1] # Remove trailing ')'. # TODO(nnorwitz): store the function_parameters. token = self._GetNextToken() assert token.token_type == tokenize.SYNTAX, token assert token.name == ';', token return self._CreateVariable(indices, name.name, indices.name, modifiers, '', None) # At this point, we got something like: # return_type (type::*name_)(params); # This is a data member called name_ that is a function pointer. # With this code: void (sq_type::*field_)(string&); # We get: name=void return_type=[] parameters=sq_type ... field_ # TODO(nnorwitz): is return_type always empty? # TODO(nnorwitz): this isn't even close to being correct. # Just put in something so we don't crash and can move on. real_name = parameters[-1] modifiers = [p.name for p in self._GetParameters()] del modifiers[-1] # Remove trailing ')'. return self._CreateVariable(indices, real_name.name, indices.name, modifiers, '', None) if token.name == '{': body = list(self.GetScope()) del body[-1] # Remove trailing '}'. else: body = None if token.name == '=': token = self._GetNextToken() if token.name == 'default' or token.name == 'delete': # Ignore explicitly defaulted and deleted special members # in C++11. token = self._GetNextToken() else: # Handle pure-virtual declarations. assert token.token_type == tokenize.CONSTANT, token assert token.name == '0', token modifiers |= FUNCTION_PURE_VIRTUAL token = self._GetNextToken() if token.name == '[': # TODO(nnorwitz): store tokens and improve parsing. # template char (&ASH(T (&seq)[N]))[N]; tokens = list(self._GetMatchingChar('[', ']')) token = self._GetNextToken() assert token.name == ';', (token, return_type_and_name, parameters) # Looks like we got a method, not a function. if len(return_type) > 2 and return_type[-1].name == '::': return_type, in_class = \ self._GetReturnTypeAndClassName(return_type) return Method(indices.start, indices.end, name.name, in_class, return_type, parameters, modifiers, templated_types, body, self.namespace_stack) return Function(indices.start, indices.end, name.name, return_type, parameters, modifiers, templated_types, body, self.namespace_stack) def _GetReturnTypeAndClassName(self, token_seq): # Splitting the return type from the class name in a method # can be tricky. For example, Return::Type::Is::Hard::To::Find(). # Where is the return type and where is the class name? # The heuristic used is to pull the last name as the class name. # This includes all the templated type info. # TODO(nnorwitz): if there is only One name like in the # example above, punt and assume the last bit is the class name. # Ignore a :: prefix, if exists so we can find the first real name. i = 0 if token_seq[0].name == '::': i = 1 # Ignore a :: suffix, if exists. end = len(token_seq) - 1 if token_seq[end-1].name == '::': end -= 1 # Make a copy of the sequence so we can append a sentinel # value. This is required for GetName will has to have some # terminating condition beyond the last name. seq_copy = token_seq[i:end] seq_copy.append(tokenize.Token(tokenize.SYNTAX, '', 0, 0)) names = [] while i < end: # Iterate through the sequence parsing out each name. new_name, next = self.GetName(seq_copy[i:]) assert new_name, 'Got empty new_name, next=%s' % next # We got a pointer or ref. Add it to the name. if next and next.token_type == tokenize.SYNTAX: new_name.append(next) names.append(new_name) i += len(new_name) # Now that we have the names, it's time to undo what we did. # Remove the sentinel value. names[-1].pop() # Flatten the token sequence for the return type. return_type = [e for seq in names[:-1] for e in seq] # The class name is the last name. class_name = names[-1] return return_type, class_name def handle_bool(self): pass def handle_char(self): pass def handle_int(self): pass def handle_long(self): pass def handle_short(self): pass def handle_double(self): pass def handle_float(self): pass def handle_void(self): pass def handle_wchar_t(self): pass def handle_unsigned(self): pass def handle_signed(self): pass def _GetNestedType(self, ctor): name = None name_tokens, token = self.GetName() if name_tokens: name = ''.join([t.name for t in name_tokens]) # Handle forward declarations. if token.token_type == tokenize.SYNTAX and token.name == ';': return ctor(token.start, token.end, name, None, self.namespace_stack) if token.token_type == tokenize.NAME and self._handling_typedef: self._AddBackToken(token) return ctor(token.start, token.end, name, None, self.namespace_stack) # Must be the type declaration. fields = list(self._GetMatchingChar('{', '}')) del fields[-1] # Remove trailing '}'. if token.token_type == tokenize.SYNTAX and token.name == '{': next = self._GetNextToken() new_type = ctor(token.start, token.end, name, fields, self.namespace_stack) # A name means this is an anonymous type and the name # is the variable declaration. if next.token_type != tokenize.NAME: return new_type name = new_type token = next # Must be variable declaration using the type prefixed with keyword. assert token.token_type == tokenize.NAME, token return self._CreateVariable(token, token.name, name, [], '', None) def handle_struct(self): # Special case the handling typedef/aliasing of structs here. # It would be a pain to handle in the class code. name_tokens, var_token = self.GetName() if name_tokens: next_token = self._GetNextToken() is_syntax = (var_token.token_type == tokenize.SYNTAX and var_token.name[0] in '*&') is_variable = (var_token.token_type == tokenize.NAME and next_token.name == ';') variable = var_token if is_syntax and not is_variable: variable = next_token temp = self._GetNextToken() if temp.token_type == tokenize.SYNTAX and temp.name == '(': # Handle methods declared to return a struct. t0 = name_tokens[0] struct = tokenize.Token(tokenize.NAME, 'struct', t0.start-7, t0.start-2) type_and_name = [struct] type_and_name.extend(name_tokens) type_and_name.extend((var_token, next_token)) return self._GetMethod(type_and_name, 0, None, False) assert temp.name == ';', (temp, name_tokens, var_token) if is_syntax or (is_variable and not self._handling_typedef): modifiers = ['struct'] type_name = ''.join([t.name for t in name_tokens]) position = name_tokens[0] return self._CreateVariable(position, variable.name, type_name, modifiers, var_token.name, None) name_tokens.extend((var_token, next_token)) self._AddBackTokens(name_tokens) else: self._AddBackToken(var_token) return self._GetClass(Struct, VISIBILITY_PUBLIC, None) def handle_union(self): return self._GetNestedType(Union) def handle_enum(self): return self._GetNestedType(Enum) def handle_auto(self): # TODO(nnorwitz): warn about using auto? Probably not since it # will be reclaimed and useful for C++0x. pass def handle_register(self): pass def handle_const(self): pass def handle_inline(self): pass def handle_extern(self): pass def handle_static(self): pass def handle_virtual(self): # What follows must be a method. token = token2 = self._GetNextToken() if token.name == 'inline': # HACK(nnorwitz): handle inline dtors by ignoring 'inline'. token2 = self._GetNextToken() if token2.token_type == tokenize.SYNTAX and token2.name == '~': return self.GetMethod(FUNCTION_VIRTUAL + FUNCTION_DTOR, None) assert token.token_type == tokenize.NAME or token.name == '::', token return_type_and_name, _ = self._GetVarTokensUpToIgnoringTemplates( tokenize.SYNTAX, '(') # ) return_type_and_name.insert(0, token) if token2 is not token: return_type_and_name.insert(1, token2) return self._GetMethod(return_type_and_name, FUNCTION_VIRTUAL, None, False) def handle_volatile(self): pass def handle_mutable(self): pass def handle_public(self): assert self.in_class self.visibility = VISIBILITY_PUBLIC def handle_protected(self): assert self.in_class self.visibility = VISIBILITY_PROTECTED def handle_private(self): assert self.in_class self.visibility = VISIBILITY_PRIVATE def handle_friend(self): tokens = self._GetTokensUpTo(tokenize.SYNTAX, ';') assert tokens t0 = tokens[0] return Friend(t0.start, t0.end, tokens, self.namespace_stack) def handle_static_cast(self): pass def handle_const_cast(self): pass def handle_dynamic_cast(self): pass def handle_reinterpret_cast(self): pass def handle_new(self): pass def handle_delete(self): tokens = self._GetTokensUpTo(tokenize.SYNTAX, ';') assert tokens return Delete(tokens[0].start, tokens[0].end, tokens) def handle_typedef(self): token = self._GetNextToken() if (token.token_type == tokenize.NAME and keywords.IsKeyword(token.name)): # Token must be struct/enum/union/class. method = getattr(self, 'handle_' + token.name) self._handling_typedef = True tokens = [method()] self._handling_typedef = False else: tokens = [token] # Get the remainder of the typedef up to the semi-colon. tokens.extend(self._GetTokensUpTo(tokenize.SYNTAX, ';')) # TODO(nnorwitz): clean all this up. assert tokens name = tokens.pop() indices = name if tokens: indices = tokens[0] if not indices: indices = token if name.name == ')': # HACK(nnorwitz): Handle pointers to functions "properly". if (len(tokens) >= 4 and tokens[1].name == '(' and tokens[2].name == '*'): tokens.append(name) name = tokens[3] elif name.name == ']': # HACK(nnorwitz): Handle arrays properly. if len(tokens) >= 2: tokens.append(name) name = tokens[1] new_type = tokens if tokens and isinstance(tokens[0], tokenize.Token): new_type = self.converter.ToType(tokens)[0] return Typedef(indices.start, indices.end, name.name, new_type, self.namespace_stack) def handle_typeid(self): pass # Not needed yet. def handle_typename(self): pass # Not needed yet. def _GetTemplatedTypes(self): result = collections.OrderedDict() tokens = list(self._GetMatchingChar('<', '>')) len_tokens = len(tokens) - 1 # Ignore trailing '>'. i = 0 while i < len_tokens: key = tokens[i].name i += 1 if keywords.IsKeyword(key) or key == ',': continue type_name = default = None if i < len_tokens: i += 1 if tokens[i-1].name == '=': assert i < len_tokens, '%s %s' % (i, tokens) default, unused_next_token = self.GetName(tokens[i:]) i += len(default) else: if tokens[i-1].name != ',': # We got something like: Type variable. # Re-adjust the key (variable) and type_name (Type). key = tokens[i-1].name type_name = tokens[i-2] result[key] = (type_name, default) return result def handle_template(self): token = self._GetNextToken() assert token.token_type == tokenize.SYNTAX, token assert token.name == '<', token templated_types = self._GetTemplatedTypes() # TODO(nnorwitz): for now, just ignore the template params. token = self._GetNextToken() if token.token_type == tokenize.NAME: if token.name == 'class': return self._GetClass(Class, VISIBILITY_PRIVATE, templated_types) elif token.name == 'struct': return self._GetClass(Struct, VISIBILITY_PUBLIC, templated_types) elif token.name == 'friend': return self.handle_friend() self._AddBackToken(token) tokens, last = self._GetVarTokensUpTo(tokenize.SYNTAX, '(', ';') tokens.append(last) self._AddBackTokens(tokens) if last.name == '(': return self.GetMethod(FUNCTION_NONE, templated_types) # Must be a variable definition. return None def handle_true(self): pass # Nothing to do. def handle_false(self): pass # Nothing to do. def handle_asm(self): pass # Not needed yet. def handle_class(self): return self._GetClass(Class, VISIBILITY_PRIVATE, None) def _GetBases(self): # Get base classes. bases = [] while 1: token = self._GetNextToken() assert token.token_type == tokenize.NAME, token # TODO(nnorwitz): store kind of inheritance...maybe. if token.name not in ('public', 'protected', 'private'): # If inheritance type is not specified, it is private. # Just put the token back so we can form a name. # TODO(nnorwitz): it would be good to warn about this. self._AddBackToken(token) else: # Check for virtual inheritance. token = self._GetNextToken() if token.name != 'virtual': self._AddBackToken(token) else: # TODO(nnorwitz): store that we got virtual for this base. pass base, next_token = self.GetName() bases_ast = self.converter.ToType(base) assert len(bases_ast) == 1, bases_ast bases.append(bases_ast[0]) assert next_token.token_type == tokenize.SYNTAX, next_token if next_token.name == '{': token = next_token break # Support multiple inheritance. assert next_token.name == ',', next_token return bases, token def _GetClass(self, class_type, visibility, templated_types): class_name = None class_token = self._GetNextToken() if class_token.token_type != tokenize.NAME: assert class_token.token_type == tokenize.SYNTAX, class_token token = class_token else: # Skip any macro (e.g. storage class specifiers) after the # 'class' keyword. next_token = self._GetNextToken() if next_token.token_type == tokenize.NAME: self._AddBackToken(next_token) else: self._AddBackTokens([class_token, next_token]) name_tokens, token = self.GetName() class_name = ''.join([t.name for t in name_tokens]) bases = None if token.token_type == tokenize.SYNTAX: if token.name == ';': # Forward declaration. return class_type(class_token.start, class_token.end, class_name, None, templated_types, None, self.namespace_stack) if token.name in '*&': # Inline forward declaration. Could be method or data. name_token = self._GetNextToken() next_token = self._GetNextToken() if next_token.name == ';': # Handle data modifiers = ['class'] return self._CreateVariable(class_token, name_token.name, class_name, modifiers, token.name, None) else: # Assume this is a method. tokens = (class_token, token, name_token, next_token) self._AddBackTokens(tokens) return self.GetMethod(FUNCTION_NONE, None) if token.name == ':': bases, token = self._GetBases() body = None if token.token_type == tokenize.SYNTAX and token.name == '{': assert token.token_type == tokenize.SYNTAX, token assert token.name == '{', token ast = AstBuilder(self.GetScope(), self.filename, class_name, visibility, self.namespace_stack) body = list(ast.Generate()) if not self._handling_typedef: token = self._GetNextToken() if token.token_type != tokenize.NAME: assert token.token_type == tokenize.SYNTAX, token assert token.name == ';', token else: new_class = class_type(class_token.start, class_token.end, class_name, bases, None, body, self.namespace_stack) modifiers = [] return self._CreateVariable(class_token, token.name, new_class, modifiers, token.name, None) else: if not self._handling_typedef: self.HandleError('non-typedef token', token) self._AddBackToken(token) return class_type(class_token.start, class_token.end, class_name, bases, templated_types, body, self.namespace_stack) def handle_namespace(self): # Support anonymous namespaces. name = None name_tokens, token = self.GetName() if name_tokens: name = ''.join([t.name for t in name_tokens]) self.namespace_stack.append(name) assert token.token_type == tokenize.SYNTAX, token # Create an internal token that denotes when the namespace is complete. internal_token = tokenize.Token(_INTERNAL_TOKEN, _NAMESPACE_POP, None, None) internal_token.whence = token.whence if token.name == '=': # TODO(nnorwitz): handle aliasing namespaces. name, next_token = self.GetName() assert next_token.name == ';', next_token self._AddBackToken(internal_token) else: assert token.name == '{', token tokens = list(self.GetScope()) # Replace the trailing } with the internal namespace pop token. tokens[-1] = internal_token # Handle namespace with nothing in it. self._AddBackTokens(tokens) return None def handle_using(self): tokens = self._GetTokensUpTo(tokenize.SYNTAX, ';') assert tokens return Using(tokens[0].start, tokens[0].end, tokens) def handle_explicit(self): assert self.in_class # Nothing much to do. # TODO(nnorwitz): maybe verify the method name == class name. # This must be a ctor. return self.GetMethod(FUNCTION_CTOR, None) def handle_this(self): pass # Nothing to do. def handle_operator(self): # Pull off the next token(s?) and make that part of the method name. pass def handle_sizeof(self): pass def handle_case(self): pass def handle_switch(self): pass def handle_default(self): token = self._GetNextToken() assert token.token_type == tokenize.SYNTAX assert token.name == ':' def handle_if(self): pass def handle_else(self): pass def handle_return(self): tokens = self._GetTokensUpTo(tokenize.SYNTAX, ';') if not tokens: return Return(self.current_token.start, self.current_token.end, None) return Return(tokens[0].start, tokens[0].end, tokens) def handle_goto(self): tokens = self._GetTokensUpTo(tokenize.SYNTAX, ';') assert len(tokens) == 1, str(tokens) return Goto(tokens[0].start, tokens[0].end, tokens[0].name) def handle_try(self): pass # Not needed yet. def handle_catch(self): pass # Not needed yet. def handle_throw(self): pass # Not needed yet. def handle_while(self): pass def handle_do(self): pass def handle_for(self): pass def handle_break(self): self._IgnoreUpTo(tokenize.SYNTAX, ';') def handle_continue(self): self._IgnoreUpTo(tokenize.SYNTAX, ';') def BuilderFromSource(source, filename): """Utility method that returns an AstBuilder from source code. Args: source: 'C++ source code' filename: 'file1' Returns: AstBuilder """ return AstBuilder(tokenize.GetTokens(source), filename) def PrintIndentifiers(filename, should_print): """Prints all identifiers for a C++ source file. Args: filename: 'file1' should_print: predicate with signature: bool Function(token) """ source = utils.ReadFile(filename, False) if source is None: sys.stderr.write('Unable to find: %s\n' % filename) return #print('Processing %s' % actual_filename) builder = BuilderFromSource(source, filename) try: for node in builder.Generate(): if should_print(node): print(node.name) except KeyboardInterrupt: return except: pass def PrintAllIndentifiers(filenames, should_print): """Prints all identifiers for each C++ source file in filenames. Args: filenames: ['file1', 'file2', ...] should_print: predicate with signature: bool Function(token) """ for path in filenames: PrintIndentifiers(path, should_print) def main(argv): for filename in argv[1:]: source = utils.ReadFile(filename) if source is None: continue print('Processing %s' % filename) builder = BuilderFromSource(source, filename) try: entire_ast = filter(None, builder.Generate()) except KeyboardInterrupt: return except: # Already printed a warning, print the traceback and continue. traceback.print_exc() else: if utils.DEBUG: for ast in entire_ast: print(ast) if __name__ == '__main__': main(sys.argv)