"""
The ``Parser`` tries to convert the available Python code in an easy to read
format, something like an abstract syntax tree. The classes who represent this
tree, are sitting in the :mod:`jedi.parser.representation` module.
The Python module ``tokenize`` is a very important part in the ``Parser``,
because it splits the code into different words (tokens). Sometimes it looks a
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
module for this? Well, ``ast`` does a very good job understanding proper Python
code, but fails to work as soon as there's a single line of broken code.
There's one important optimization that needs to be known: Statements are not
being parsed completely. ``Statement`` is just a representation of the tokens
within the statement. This lowers memory usage and cpu time and reduces the
complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
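
A rough usage sketch (illustrative only; the attributes shown follow this
module and :mod:`jedi.parser.representation`)::

    from jedi.parser import Parser

    source = open('example.py').read()
    parser = Parser(source, module_path='example.py')
    module = parser.module            # the top-level SubModule
    used_names = module.used_names    # dict: name string -> list of Names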
"""
import keyword
import logging
from jedi._compatibility import next, unicode
from jedi import debug
from jedi import common
from jedi.parser import representation as pr
from jedi.parser import tokenize
from jedi.parser import pytree
from jedi.parser.pgen2 import Driver
from jedi.parser import pgen2
OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
# Not used yet. In the future I intend to add something like KeywordStatement
STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \
'return', 'yield', 'pass', 'continue', 'break'
class Parser(object):
"""
    This class is used to parse a Python file. It divides the code into a
    class structure of different scopes.

    :param source: The source code of the module.
    :type source: str
:param module_path: The path of the module in the file system, may be None.
:type module_path: str
:param no_docstr: If True, a string at the beginning is not a docstr.
:param top_module: Use this module as a parent instead of `self.module`.
"""
def __init__(self, source, module_path=None, no_docstr=False,
tokenizer=None, top_module=None):
if not source.endswith('\n'):
source += '\n'
_ast_mapping = {
'expr_stmt': pr.ExprStmt,
'classdef': pr.Class,
'funcdef': pr.Function,
'file_input': pr.SubModule,
'import_name': pr.Import,
'import_from': pr.Import,
'break_stmt': pr.KeywordStatement,
'continue_stmt': pr.KeywordStatement,
'return_stmt': pr.ReturnStmt,
'raise_stmt': pr.KeywordStatement,
'yield_expr': pr.YieldExpr,
'del_stmt': pr.KeywordStatement,
'pass_stmt': pr.KeywordStatement,
'global_stmt': pr.GlobalStmt,
'nonlocal_stmt': pr.KeywordStatement,
'assert_stmt': pr.KeywordStatement,
'if_stmt': pr.IfStmt,
'with_stmt': pr.WithStmt,
'for_stmt': pr.ForStmt,
'while_stmt': pr.WhileStmt,
'try_stmt': pr.TryStmt,
'comp_for': pr.CompFor,
}
self._ast_mapping = dict((getattr(pytree.python_symbols, k), v)
for k, v in _ast_mapping.items())
self.global_names = []
self.used_names = {}
self.scope_names_stack = [{}]
logger = logging.getLogger("Jedi-Parser")
d = Driver(pytree.python_grammar, self.convert_node, self.convert_leaf,
self.error_recovery, logger=logger)
self.module = d.parse_string(source).get_parent_until()
self.module.used_names = self.used_names
self.module.set_global_names(self.global_names)
def convert_node(self, grammar, type, children):
"""
Convert raw node information to a Node instance.
        This is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
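
        For example, a ``funcdef`` reduction is turned into a ``pr.Function``
        via ``self._ast_mapping``; node types without a mapping fall back to
        the generic ``pr.Node``.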
"""
#print(type, children, pytree.type_repr(type))
try:
new_node = self._ast_mapping[type](children)
except KeyError:
new_node = pr.Node(type, children)
        # We always need to check the freshly created node, because the same
        # node type can be returned by convert multiple times.
if type == pytree.python_symbols.global_stmt:
self.global_names += new_node.names()
elif isinstance(new_node, (pr.ClassOrFunc, pr.Module)) \
and type in (pytree.python_symbols.funcdef,
pytree.python_symbols.classdef,
pytree.python_symbols.file_input):
# scope_name_stack handling
scope_names = self.scope_names_stack.pop()
if isinstance(new_node, pr.ClassOrFunc):
n = new_node.name
scope_names[n.value].remove(n)
# Set the func name of the current node
arr = self.scope_names_stack[-1].setdefault(n.value, [])
arr.append(n)
new_node.names_dict = scope_names
elif isinstance(new_node, pr.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
            # current scope. They are part of the comprehension scope.
for n in new_node.get_defined_names():
self.scope_names_stack[-1][n.value].remove(n)
return new_node
def convert_leaf(self, grammar, type, value, prefix, start_pos):
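        """
        Convert a raw token into a leaf node: a ``Keyword``, ``Name``,
        ``Literal``, ``Whitespace`` or ``Operator``. This is the counterpart
        of :meth:`convert_node` for terminals.
        """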
#print('leaf', value, pytree.type_repr(type))
if type == tokenize.NAME:
if value in grammar.keywords:
if value in ('def', 'class'):
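                    # A new function/class scope begins; collect the names
                    # defined in it in a fresh dict (popped in convert_node).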
self.scope_names_stack.append({})
return pr.Keyword(value, start_pos, prefix)
else:
name = pr.Name(value, start_pos, prefix)
# Keep a listing of all used names
arr = self.used_names.setdefault(name.value, [])
arr.append(name)
arr = self.scope_names_stack[-1].setdefault(name.value, [])
arr.append(name)
return name
elif type in (tokenize.STRING, tokenize.NUMBER):
return pr.Literal(value, start_pos, prefix)
elif type in (tokenize.NEWLINE, tokenize.ENDMARKER):
return pr.Whitespace(value, start_pos, prefix)
else:
return pr.Operator(value, start_pos, prefix)
def error_recovery(self, grammar, stack, type, value):
"""
This parser is written in a dynamic way, meaning that this parser
allows using different grammars (even non-Python). However, error
recovery is purely written for Python.
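
        If a token cannot be shifted, everything up to the last open ``suite``
        or ``file_input`` node on the stack is discarded, so that parsing can
        continue after the broken line.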
"""
        # For now just discard everything that is not a suite or
        # file_input, if we detect an error.
        for index, (dfa, state, (node_type, _)) \
                in reversed(list(enumerate(stack))):
            # `suite` can sometimes be only simple_stmt, not stmt.
            if node_type in (grammar.symbol2number['file_input'],
                             grammar.symbol2number['suite']):
                break
        self._stack_removal(stack, index + 1)
        # No transition was found; instead of raising a ParseError like the
        # vanilla pgen2 driver would, continue with the cleaned-up stack.
def _stack_removal(self, stack, start_index):
def clear_names(children):
for c in children:
try:
clear_names(c.children)
except AttributeError:
if isinstance(c, pr.Name):
try:
self.scope_names_stack[-1][c.value].remove(c)
self.used_names[c.value].remove(c)
except ValueError:
pass # This may happen with CompFor.
for dfa, state, node in stack[start_index:]:
clear_names(children=node[1])
stack[start_index:] = []
def __init__old__(self, source, module_path=None, no_docstr=False,
tokenizer=None, top_module=None):
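        # NOTE: This appears to be the old, tokenizer-based implementation,
        # superseded by the pgen2-based __init__ above and kept for reference.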
self.no_docstr = no_docstr
tokenizer = tokenizer or tokenize.source_tokens(source)
self._gen = PushBackTokenizer(tokenizer)
# initialize global Scope
start_pos = next(self._gen).start_pos
self._gen.push_last_back()
self.module = pr.SubModule(module_path, start_pos, top_module)
self._scope = self.module
self._top_module = top_module or self.module
try:
self._parse()
        except (common.MultiLevelStopIteration, StopIteration):
            # StopIteration needs to be added as well, because Python 2 has a
            # strange way of handling StopIterations.
            # Sometimes StopIteration isn't caught. Just ignore it.
            pass

        # On finish, set end_pos correctly.
s = self._scope
while s is not None:
s.end_pos = self._gen.current.end_pos
s = s.parent
# clean up unused decorators
for d in self._decorators:
            # Set a parent for unused decorators; otherwise accessing
            # `self.module.used_names` through them would fail.
d.parent = self.module
self.module.end_pos = self._gen.current.end_pos
if self._gen.current.type == tokenize.NEWLINE:
# This case is only relevant with the FastTokenizer, because
# otherwise there's always an ENDMARKER.
            # We added a newline before, so we need to "remove" it again.
            #
            # NOTE: end_pos should be kept as-is if the last token of the
            # source is a NEWLINE, otherwise the newline at the end of the
            # source is not included in a ParserNode.code.
if self._gen.previous.type != tokenize.NEWLINE:
self.module.end_pos = self._gen.previous.end_pos
del self._gen
def __repr__(self):
return "<%s: %s>" % (type(self).__name__, self.module)
def _check_user_stmt(self, simple):
        # This is not really user checking; it just updates `used_names`.
for tok_name in self.module.temp_used_names:
try:
self.module.used_names[tok_name].add(simple)
except KeyError:
self.module.used_names[tok_name] = set([simple])
self.module.temp_used_names = []
if isinstance(simple, pr.Statement):
for name, calls in simple.get_names_dict().items():
self._scope.add_name_calls(name, calls)
def _parse_dotted_name(self, pre_used_token=None):
"""
The dot name parser parses a name, variable or function and returns
their names.
Just used for parsing imports.
:return: tuple of Name, next_token
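
        For ``import os.path``, for example, this returns roughly
        ``([Name(os), Name(path)], <token following the dotted name>)``.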
"""
def append(tok):
names.append(pr.Name(self.module, tok.string, None, tok.start_pos))
self.module.temp_used_names.append(tok.string)
names = []
tok = next(self._gen) if pre_used_token is None else pre_used_token
if tok.type != tokenize.NAME and tok.string != '*':
return [], tok
append(tok)
while True:
tok = next(self._gen)
if tok.string != '.':
break
tok = next(self._gen)
if tok.type != tokenize.NAME:
break
append(tok)
return names, tok
def _parse_name(self, pre_used_token=None):
tok = next(self._gen) if pre_used_token is None else pre_used_token
self.module.temp_used_names.append(tok.string)
if tok.type != tokenize.NAME:
return None, tok
return pr.Name(self.module, tok.string, None, tok.start_pos), next(self._gen)
def _parse_import_list(self):
"""
The parser for the imports. Unlike the class and function parse
function, this returns no Import class, but rather an import list,
which is then added later on.
The reason, why this is not done in the same class lies in the nature
of imports. There are two ways to write them:
- from ... import ...
- import ...
To distinguish, this has to be processed after the parser.
:return: List of imports.
:rtype: list
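
        Each entry is a ``(names, alias, defunct)`` triple; for
        ``import os, sys as system`` the result is roughly
        ``[([Name(os)], None, False), ([Name(sys)], Name(system), False)]``.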
"""
imports = []
brackets = False
continue_kw = [",", ";", "\n", '\r\n', ')'] \
+ list(set(keyword.kwlist) - set(['as']))
while True:
defunct = False
tok = next(self._gen)
if tok.string == '(': # python allows only one `(` in the statement.
brackets = True
tok = next(self._gen)
if brackets and tok.type == tokenize.NEWLINE:
tok = next(self._gen)
names, tok = self._parse_dotted_name(tok)
if not names:
defunct = True
alias = None
if tok.string == 'as':
alias, tok = self._parse_name()
imports.append((names, alias, defunct))
while tok.string not in continue_kw:
tok = next(self._gen)
if not (tok.string == "," or brackets and tok.type == tokenize.NEWLINE):
break
return imports
def _parse_parentheses(self, is_class):
"""
        Functions and classes have parentheses with "params" (for classes
        these are the superclasses). They are parsed here and returned as
        statements.

        :return: List of Statements
        :rtype: list
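
        For ``def f(a, b=3):`` this returns two ``Param`` objects; for
        ``class C(Base):`` it returns a single ``ExprStmt`` wrapping ``Base``.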
"""
params = []
tok = None
pos = 0
breaks = [',', ':']
while tok is None or tok.string not in (')', ':'):
            # Classes don't have params; a class definition works more like a
            # function call, so its "params" are really superclasses.
param, tok = self._parse_statement(added_breaks=breaks,
stmt_class=pr.ExprStmt
if is_class else pr.Param)
if is_class:
if param is not None:
params.append(param)
else:
if param is not None and tok.string == ':':
# parse annotations
annotation, tok = self._parse_statement(added_breaks=breaks)
if annotation:
param.add_annotation(annotation)
                # Function params without variables are usually syntax errors;
                # plain expressions are only valid in superclass declarations.
if param is not None and param.get_defined_names():
param.position_nr = pos
params.append(param)
pos += 1
return params
def _parse_function(self):
"""
The parser for a text functions. Process the tokens, which follow a
function definition.
:return: Return a Scope representation of the tokens.
:rtype: Function
"""
first_pos = self._gen.current.start_pos
tok = next(self._gen)
if tok.type != tokenize.NAME:
return None
fname, tok = self._parse_name(tok)
if tok.string != '(':
return None
params = self._parse_parentheses(is_class=False)
colon = next(self._gen)
annotation = None
if colon.string in ('-', '->'):
# parse annotations
if colon.string == '-':
# The Python 2 tokenizer doesn't understand this
colon = next(self._gen)
if colon.string != '>':
return None
annotation, colon = self._parse_statement(added_breaks=[':'])
if colon.string != ':':
return None
        # first_pos is used because the def/params may span multiple lines.
return pr.Function(self.module, fname, params, first_pos, annotation)
def _parse_class(self):
"""
The parser for a text class. Process the tokens, which follow a
class definition.
:return: Return a Scope representation of the tokens.
:rtype: Class
"""
first_pos = self._gen.current.start_pos
cname = next(self._gen)
if cname.type != tokenize.NAME:
debug.warning("class: syntax err, token is not a name@%s (%s: %s)",
cname.start_pos[0], tokenize.tok_name[cname.type], cname.string)
return None
cname, _next = self._parse_name(cname)
superclasses = []
if _next.string == '(':
superclasses = self._parse_parentheses(is_class=True)
_next = next(self._gen)
if _next.string != ':':
debug.warning("class syntax: %s@%s", cname, _next.start_pos[0])
return None
return pr.Class(self.module, cname, superclasses, first_pos)
def _parse_statement(self, pre_used_token=None, added_breaks=None,
stmt_class=pr.ExprStmt, names_are_set_vars=False,
maybe_docstr=False):
"""
Parses statements like::
a = test(b)
a += 3 - 2 or b
and so on. One line at a time.
:param pre_used_token: The pre parsed token.
:type pre_used_token: set
:return: ExprStmt + last parsed token.
:rtype: (ExprStmt, str)
"""
set_vars = []
level = 0 # The level of parentheses
if pre_used_token:
tok = pre_used_token
else:
tok = next(self._gen)
while tok.type == tokenize.COMMENT:
# remove newline and comment
next(self._gen)
tok = next(self._gen)
first_pos = tok.start_pos
opening_brackets = ['{', '(', '[']
closing_brackets = ['}', ')', ']']
        # The difference between `breaks` and `always_break` is that the
        # latter breaks even inside parentheses. This is true for typical
        # flow commands like `def` and `class` and the imports, which are
        # never used within a statement.
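        # E.g. a stray `def` inside a parenthesized expression still ends the
        # statement, while a newline inside parentheses does not.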
breaks = set(['\n', '\r\n', ':', ')'])
always_break = [';', 'import', 'from', 'class', 'def', 'try', 'except',
'finally', 'while', 'return', 'yield']
not_first_break = ['del', 'raise']
if added_breaks:
breaks |= set(added_breaks)
tok_list = []
as_names = []
in_lambda_param = False
while not (tok.string in always_break
or tok.string in not_first_break and not tok_list
or tok.string in breaks and level <= 0
and not (in_lambda_param and tok.string in ',:')):
try:
is_kw = tok.string in OPERATOR_KEYWORDS
if tok.type == tokenize.OP or is_kw:
tok_list.append(
pr.Operator(self.module, tok.string, self._scope, tok.start_pos)
)
else:
tok_list.append(tok)
if tok.string == 'as':
tok = next(self._gen)
if tok.type == tokenize.NAME:
n, tok = self._parse_name(self._gen.current)
if n:
set_vars.append(n)
as_names.append(n)
tok_list.append(n)
continue
elif tok.string == 'lambda':
breaks.discard(':')
in_lambda_param = True
elif in_lambda_param and tok.string == ':':
in_lambda_param = False
elif tok.type == tokenize.NAME and not is_kw:
tok_list[-1], tok = self._parse_name(tok)
continue
elif tok.string in opening_brackets:
level += 1
elif tok.string in closing_brackets:
level -= 1
tok = next(self._gen)
except (StopIteration, common.MultiLevelStopIteration):
# comes from tokenizer
break
if not tok_list:
return None, tok
first_tok = tok_list[0]
# docstrings
if len(tok_list) == 1 and isinstance(first_tok, tokenize.Token) \
and first_tok.type == tokenize.STRING and maybe_docstr:
# Normal docstring check
if self.freshscope and not self.no_docstr:
self._scope.add_docstr(first_tok)
return None, tok
            # Attribute docstring (PEP 224) support (used e.g. by Sphinx).
            # If a string literal is being parsed...
else:
with common.ignored(IndexError, AttributeError):
# ...then set it as a docstring
self._scope.statements[-1].add_docstr(first_tok)
return None, tok
stmt = stmt_class(self.module, tok_list, first_pos, tok.end_pos,
as_names=as_names,
names_are_set_vars=names_are_set_vars)
stmt.parent = self._top_module
self._check_user_stmt(stmt)
if tok.string in always_break + not_first_break:
self._gen.push_last_back()
return stmt, tok
def _parse(self):
"""
The main part of the program. It analyzes the given code-text and
returns a tree-like scope. For a more detailed description, see the
class description.
:param text: The code which should be parsed.
:param type: str
:raises: IndentationError
"""
extended_flow = ['else', 'elif', 'except', 'finally']
statement_toks = ['{', '[', '(', '`']
self._decorators = []
self.freshscope = True
for tok in self._gen:
token_type = tok.type
tok_str = tok.string
first_pos = tok.start_pos
self.module.temp_used_names = []
# debug.dbg('main: tok=[%s] type=[%s] indent=[%s]', \
# tok, tokenize.tok_name[token_type], start_position[0])
            # Check again for unindented stuff. This is true for syntax
            # errors. Only check for names, because that's what's relevant
            # here. If some docstrings are not indented, I don't care.
while first_pos[1] <= self._scope.start_pos[1] \
and (token_type == tokenize.NAME or tok_str in ('(', '['))\
and self._scope != self.module:
self._scope.end_pos = first_pos
self._scope = self._scope.parent
if isinstance(self._scope, pr.Module) \
and not isinstance(self._scope, pr.SubModule):
self._scope = self.module
if isinstance(self._scope, pr.SubModule):
use_as_parent_scope = self._top_module
else:
use_as_parent_scope = self._scope
if tok_str == 'def':
func = self._parse_function()
if func is None:
debug.warning("function: syntax error@%s", first_pos[0])
continue
self.freshscope = True
self._scope = self._scope.add_scope(func, self._decorators)
self._decorators = []
elif tok_str == 'class':
cls = self._parse_class()
if cls is None:
                    debug.warning("class: syntax error@%s", first_pos[0])
continue
self.freshscope = True
self._scope = self._scope.add_scope(cls, self._decorators)
self._decorators = []
# import stuff
elif tok_str == 'import':
imports = self._parse_import_list()
for count, (names, alias, defunct) in enumerate(imports):
e = (alias or names and names[-1] or self._gen.previous).end_pos
end_pos = self._gen.previous.end_pos if count + 1 == len(imports) else e
i = pr.Import(self.module, first_pos, end_pos, names,
alias, defunct=defunct)
self._check_user_stmt(i)
self._scope.add_import(i)
if not imports:
i = pr.Import(self.module, first_pos, self._gen.current.end_pos,
None, defunct=True)
self._check_user_stmt(i)
self.freshscope = False
elif tok_str == 'from':
defunct = False
                # take care of relative imports
relative_count = 0
while True:
tok = next(self._gen)
if tok.string != '.':
break
relative_count += 1
# the from import
from_names, tok = self._parse_dotted_name(self._gen.current)
tok_str = tok.string
if len(from_names) == 1 and str(from_names[0]) == 'import' and relative_count:
self._gen.push_last_back()
tok_str = 'import'
from_names = []
if not from_names and not relative_count or tok_str != "import":
debug.warning("from: syntax error@%s", tok.start_pos[0])
defunct = True
if tok_str != 'import':
self._gen.push_last_back()
imports = self._parse_import_list()
for count, (names, alias, defunct2) in enumerate(imports):
star = names and unicode(names[-1]) == '*'
if star:
names = []
e = (alias or names and names[-1] or self._gen.previous).end_pos
#end_pos = self._gen.previous.end_pos if count + 1 == len(names) else e
i = pr.Import(self.module, first_pos, e, names,
alias, from_names, star, relative_count,
defunct=defunct or defunct2)
self._check_user_stmt(i)
self._scope.add_import(i)
self.freshscope = False
# loops
elif tok_str == 'for':
set_stmt, tok = self._parse_statement(added_breaks=['in'],
names_are_set_vars=True)
if tok.string != 'in':
debug.warning('syntax err, for flow incomplete @%s', tok.start_pos[0])
try:
statement, tok = self._parse_statement()
except StopIteration:
statement, tok = None, None
s = [] if statement is None else [statement]
f = pr.ForFlow(self.module, s, first_pos, set_stmt)
self._scope = self._scope.add_statement(f)
if tok is None or tok.string != ':':
debug.warning('syntax err, for flow started @%s', first_pos[0])
elif tok_str in ['if', 'while', 'try', 'with'] + extended_flow:
added_breaks = []
command = tok_str
if command in ('except', 'with'):
added_breaks.append(',')
# multiple inputs because of with
inputs = []
first = True
while first or command == 'with' and tok.string not in (':', '\n', '\r\n'):
statement, tok = \
self._parse_statement(added_breaks=added_breaks)
if command == 'except' and tok.string == ',':
                        # The except statement defines a variable.
                        # This is only true for Python 2.
n, tok = self._parse_name()
if n:
n.parent = statement
statement.as_names.append(n)
if statement:
inputs.append(statement)
first = False
f = pr.Flow(self.module, command, inputs, first_pos)
if command in extended_flow:
# The last statement has to be another part of the flow
# statement, because a dedent releases the main scope, so
# just take the last statement.
try:
s = self._scope.statements[-1].set_next(f)
except (AttributeError, IndexError):
# If set_next doesn't exist, just add it.
s = self._scope.add_statement(f)
else:
s = self._scope.add_statement(f)
self._scope = s
if tok.string != ':':
debug.warning('syntax err, flow started @%s', tok.start_pos[0])
# returns
elif tok_str in ('return', 'yield'):
s = tok.start_pos
self.freshscope = False
# Add returns to the scope
# Should be a function, otherwise just add it to a module!
func = self._scope.get_parent_until((pr.Function, pr.Module))
if tok_str == 'yield':
func.is_generator = True
stmt, tok = self._parse_statement()
if stmt is not None:
stmt.parent = use_as_parent_scope
try:
kw_stmt = pr.KeywordStatement(tok_str, s,
use_as_parent_scope, stmt)
self._scope.statements.append(kw_stmt)
func.returns.append(kw_stmt)
# start_pos is the one of the return statement
stmt.start_pos = s
except AttributeError:
debug.warning('return in non-function')
stmt = None
elif tok_str == 'assert':
stmt, tok = self._parse_statement()
if stmt is not None:
stmt.parent = use_as_parent_scope
self._scope.statements.append(stmt)
self._scope.asserts.append(stmt)
elif tok_str in STATEMENT_KEYWORDS:
stmt, _ = self._parse_statement()
kw = pr.KeywordStatement(tok_str, tok.start_pos,
use_as_parent_scope, stmt)
self._scope.add_statement(kw)
if stmt is not None and tok_str == 'global':
for t in stmt._token_list:
if isinstance(t, pr.Name):
# Add the global to the top module, it counts there.
self.module.add_global(t)
# decorator
elif tok_str == '@':
stmt, tok = self._parse_statement()
if stmt is not None:
self._decorators.append(stmt)
elif tok_str == 'pass':
continue
# default
elif token_type in (tokenize.NAME, tokenize.STRING,
tokenize.NUMBER, tokenize.OP) \
or tok_str in statement_toks:
                # This is the main part. A name can be a function or a normal
                # variable, and almost anything can follow it; that part is
                # handled by the statement parser.
stmt, tok = self._parse_statement(self._gen.current,
maybe_docstr=True)
if stmt:
self._scope.add_statement(stmt)
self.freshscope = False
else:
if token_type not in (tokenize.COMMENT, tokenize.NEWLINE, tokenize.ENDMARKER):
debug.warning('Token not used: %s %s %s', tok_str,
tokenize.tok_name[token_type], first_pos)
continue
self.no_docstr = False
class PushBackTokenizer(object):
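    """
    A thin wrapper around a tokenizer that keeps track of the current and
    previous token and allows pushing the current token back, so that it is
    returned again on the next call.
    """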
def __init__(self, tokenizer):
self._tokenizer = tokenizer
self._push_backs = []
self.current = self.previous = tokenize.Token(None, '', (0, 0))
def push_last_back(self):
self._push_backs.append(self.current)
def next(self):
""" Python 2 Compatibility """
return self.__next__()
def __next__(self):
if self._push_backs:
return self._push_backs.pop(0)
previous = self.current
self.current = next(self._tokenizer)
self.previous = previous
return self.current
def __iter__(self):
return self