blob: 7056abf74a00e2e20b1597a510e50986211482be [file] [log] [blame]
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import print_function
import parser
import symbol
import sys
import token
import tokenize
from py_utils.refactor import offset_token
class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined in the "Full Grammar specification" section
  of the Python language reference.

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
        defined by the left-hand sides of the BNFs in that grammar.

  Subclasses are expected to implement the `type`, `type_name`, `children`
  and `tokens` properties as well as PrintTree(); the search helpers below
  are written purely in terms of `type` and `children`.
  """

  @property
  def type(self):
    """Return the integer type code of this node."""
    raise NotImplementedError()

  @property
  def type_name(self):
    """Return the printable name of this node's type."""
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    return offset_token.Untokenize(self.tokens)

  @staticmethod
  def _Matches(snippet, snippet_type):
    """Return whether |snippet| is of the requested |snippet_type|.

    Args:
      snippet: The node to test.
      snippet_type: Either an integer type code, which is compared against
          snippet.type, or a class (or tuple of classes), which is checked
          with isinstance().
    """
    if isinstance(snippet_type, int):
      return snippet.type == snippet_type
    # Only reached for non-int selectors; isinstance() would raise TypeError
    # if handed an integer type code as its second argument.
    return isinstance(snippet, snippet_type)

  def FindAll(self, snippet_type):
    """Yield this node and all of its descendants that match |snippet_type|.

    The node itself is tested first, then each child's subtree is searched
    in order.

    Args:
      snippet_type: An integer type code or a class; see _Matches().
    """
    if self._Matches(self, snippet_type):
      yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching a type and attribute filters.

    Args:
      snippet_type: An integer type code or a class; see _Matches().
      **kwargs: Attribute name/value pairs. A child is only accepted if
          getattr(child, name) equals the value for every pair given.

    Returns:
      The first matching child, in the order of self.children.

    Raises:
      ValueError: If no direct child satisfies all the filters.
    """
    for child in self.children:
      if not self._Matches(child, snippet_type):
        continue

      # Iterate over items(): iterating the dict itself yields only the
      # attribute names, which cannot be unpacked into (attribute, value).
      if any(getattr(child, attribute) != value
             for attribute, value in kwargs.items()):
        continue

      return child

    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield every direct child that matches |snippet_type|.

    Args:
      snippet_type: An integer type code or a class; see _Matches().
    """
    for child in self.children:
      if self._Matches(child, snippet_type):
        yield child
class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    """Wrap |tokens| as a leaf node of type |token_type|.

    Args:
      token_type: The node's type code, as indexed by token.tok_name.
      tokens: The list of token objects held by this node. The list is
          stored as-is, not copied.
    """
    # For operators and delimiters, the TokenSnippet's type may be more specific
    # than the type of the constituent token. E.g. the TokenSnippet type is
    # token.DOT, but the token type is token.OP. This is because the parser
    # has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Return a new snippet holding one freshly made OffsetToken.

    Args:
      token_type: Type code used for both the snippet and its token.
      string: The token's text.
      offset: Passed through to offset_token.OffsetToken unchanged.
    """
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    """The text of the last token held by this snippet."""
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    # Rewriting the text also flags the snippet as modified.
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    # Token snippets are leaves.
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    """Whether `value` has been assigned since construction."""
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this leaf to |stream|, one token per line.

    The first line shows the type name followed by the repr of the first
    token's string; any further tokens are printed on their own lines,
    aligned under the first token.

    Args:
      indent: Number of spaces to prefix each line with.
      stream: File-like object to write to.
    """
    stream.write(' ' * indent)
    if not self.tokens:
      # Nothing but the type name to show; return before indexing tokens[0].
      print(self.type_name, file=stream)
      return

    print('%-4s' % self.type_name, repr(self.tokens[0].string), file=stream)
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      print(' ' * max(len(self.type_name), 4), repr(tok.string), file=stream)
class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module.
  """

  def __init__(self, symbol_type, children):
    """Create an internal node.

    Args:
      symbol_type: The node's type code, as indexed by symbol.sym_name.
      children: The sequence of child snippets; stored as-is.
    """
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    """Return the tokens of all children, concatenated in child order."""
    collected = []
    for child in self.children:
      collected.extend(child.tokens)
    return tuple(collected)

  @property
  def modified(self):
    """Whether any child reports itself as modified."""
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this subtree to |stream|, indenting each level by two spaces.

    Args:
      indent: Number of spaces to prefix this node's line with.
      stream: File-like object to write to.
    """
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      print(node.type_name, end=' ', file=stream)
      node = node.children[0]

    # `node` is now the last node of the collapsed chain; it ends the line
    # and its children are printed one level deeper.
    print(node.type_name, file=stream)
    for child in node.children:
      child.PrintTree(indent + 2, stream)
def Snippetize(f):
"""Return the syntax tree of the given file."""
# Builds the tree in two passes: the `parser` module supplies the grammar
# structure (converted to nested lists by st2list), and the tokenizer
# supplies the tokens that _SnippetizeNode attaches to that structure.
# NOTE(review): the statement below is cut off in this view (unbalanced
# parentheses); the argument given to parser.suite() is not visible here --
# confirm against the upstream file.
syntax_tree = parser.st2list(parser.suite(
tokens = offset_token.Tokenize(f)
snippet = _SnippetizeNode(syntax_tree, tokens)
# Sanity check: no tokens may be left over once the tree has been built
# (presumably _SnippetizeNode consumes them as it goes -- TODO confirm).
assert not tokens
return snippet
def _SnippetizeNode(node, tokens):
# Convert one node of the parser's list-form syntax tree into a Snippet.
# `node[0]` is the node's type code; for symbols, `node[1:]` are the child
# nodes. `tokens` is the token sequence shared by the whole recursion and is
# inspected from the front (`tokens[0]`).
# NOTE(review): indentation and several statements of this function appear
# to be missing from this view (e.g. the body of the `while` loop and
# whatever removes entries from `tokens`) -- confirm against the upstream
# file before relying on the details below.
#
# The parser module gives a syntax tree that discards comments,
# non-terminating newlines, and whitespace information. Use the tokens given
# by the tokenize module to annotate the syntax tree with the information
# needed to exactly reproduce the original source code.
node_type = node[0]
# Type codes at or above token.NT_OFFSET denote non-terminals (symbols).
if node_type >= token.NT_OFFSET:
# Symbol.
# Recurse into every child; all calls share the same `tokens` sequence.
children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
return Symbol(node_type, children)
# Token.
grabbed_tokens = []
# Leading comments and non-terminating newlines are not in the parser's tree;
# this loop targets them while they sit at the front of `tokens`.
while tokens and (
tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
# parser has 2 NEWLINEs right before the end.
# tokenize has 0 or 1 depending on if the file has one.
# Create extra nodes without consuming tokens to account for this.
if node_type == token.NEWLINE:
# Look ahead through the remaining tokens: reaching ENDMARKER here yields a
# NEWLINE snippet built only from what has been grabbed so far.
for tok in tokens:
if tok.type == token.ENDMARKER:
return TokenSnippet(node_type, grabbed_tokens)
if tok.type != token.DEDENT:
# The next real token must agree with the parser's node type; the tokenizer
# reports operators and delimiters generically as token.OP.
assert tokens[0].type == token.OP or node_type == tokens[0].type
return TokenSnippet(node_type, grabbed_tokens)