# jedi-fork/jedi/parser/__init__.py
"""
The ``Parser`` tries to convert the available Python code in an easy to read
format, something like an abstract syntax tree. The classes who represent this
tree, are sitting in the :mod:`jedi.parser.representation` module.
The Python module ``tokenize`` is a very important part in the ``Parser``,
because it splits the code into different words (tokens). Sometimes it looks a
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
module for this? Well, ``ast`` does a very good job understanding proper Python
code, but fails to work as soon as there's a single line of broken code.
There's one important optimization that needs to be known: Statements are not
being parsed completely. ``Statement`` is just a representation of the tokens
within the statement. This lowers memory usage and cpu time and reduces the
complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
"""
import keyword
import logging
from jedi._compatibility import next, unicode
from jedi import debug
from jedi import common
from jedi.parser import representation as pr
from jedi.parser import tokenize
from jedi.parser import pytree
from jedi.parser.pgen2 import Driver
from jedi.parser import pgen2

OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
# Not used yet. In the future I intend to add something like KeywordStatement
STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \
    'return', 'yield', 'pass', 'continue', 'break'


class Parser(object):
"""
This class is used to parse a Python file, it then divides them into a
class structure of different scopes.
:param source: The codebase for the parser.
:type source: str
:param module_path: The path of the module in the file system, may be None.
:type module_path: str
:param no_docstr: If True, a string at the beginning is not a docstr.
:param top_module: Use this module as a parent instead of `self.module`.
"""
    def __init__(self, source, module_path=None, no_docstr=False,
                 tokenizer=None, top_module=None):
        if not source.endswith('\n'):
            source += '\n'

        _ast_mapping = {
            'expr_stmt': pr.ExprStmt,
            'classdef': pr.Class,
            'funcdef': pr.Function,
            'file_input': pr.SubModule,
            'import_name': pr.ImportName,
            'import_from': pr.ImportFrom,
            'break_stmt': pr.KeywordStatement,
            'continue_stmt': pr.KeywordStatement,
            'return_stmt': pr.ReturnStmt,
            'raise_stmt': pr.KeywordStatement,
            'yield_expr': pr.YieldExpr,
            'del_stmt': pr.KeywordStatement,
            'pass_stmt': pr.KeywordStatement,
            'global_stmt': pr.GlobalStmt,
            'nonlocal_stmt': pr.KeywordStatement,
            'assert_stmt': pr.KeywordStatement,
            'if_stmt': pr.IfStmt,
            'with_stmt': pr.WithStmt,
            'for_stmt': pr.ForStmt,
            'while_stmt': pr.WhileStmt,
            'try_stmt': pr.TryStmt,
            'comp_for': pr.CompFor,
            'decorator': pr.Decorator,
        }
        self._ast_mapping = dict((getattr(pytree.python_symbols, k), v)
                                 for k, v in _ast_mapping.items())

        self.global_names = []
        #if self.options["print_function"]:
        #    python_grammar = pygram.python_grammar_no_print_statement
        #else:
        self.used_names = {}
        self.scope_names_stack = [{}]

        logger = logging.getLogger("Jedi-Parser")
        d = Driver(pytree.python_grammar, self.convert_node, self.convert_leaf,
                   self.error_recovery, logger=logger)
        self.module = d.parse_string(source).get_parent_until()
        self.module.used_names = self.used_names
        self.module.path = module_path
        self.module.set_global_names(self.global_names)
    def convert_node(self, grammar, type, children):
        """
        Convert raw node information to a Node instance.

        This is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that
        the tree is built strictly bottom-up.
        """
        #print(type, children, pytree.type_repr(type))
        try:
            new_node = self._ast_mapping[type](children)
        except KeyError:
            new_node = pr.Node(type, children)

        # We need to check raw_node always, because the same node can be
        # returned by convert multiple times.
        if type == pytree.python_symbols.global_stmt:
            self.global_names += new_node.get_defined_names()
        elif isinstance(new_node, (pr.ClassOrFunc, pr.Module)) \
                and type in (pytree.python_symbols.funcdef,
                             pytree.python_symbols.classdef,
                             pytree.python_symbols.file_input):
            # scope_names_stack handling
            scope_names = self.scope_names_stack.pop()
            if isinstance(new_node, pr.ClassOrFunc):
                n = new_node.name
                scope_names[n.value].remove(n)
                # Set the func name of the current node
                arr = self.scope_names_stack[-1].setdefault(n.value, [])
                arr.append(n)
            new_node.names_dict = scope_names
        elif isinstance(new_node, pr.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
            # current scope. They are part of the comprehension scope.
            for n in new_node.get_defined_names():
                self.scope_names_stack[-1][n.value].remove(n)
        return new_node

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
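        # Counterpart of convert_node for terminals: wraps a single token in
        # the matching leaf class and, for plain names, records the occurrence
        # both in `used_names` and in the names dict of the innermost open
        # scope (the top of `scope_names_stack`).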
        #print('leaf', value, pytree.type_repr(type))
        if type == tokenize.NAME:
            if value in grammar.keywords:
                if value in ('def', 'class'):
                    self.scope_names_stack.append({})
                return pr.Keyword(value, start_pos, prefix)
            else:
                name = pr.Name(value, start_pos, prefix)
                # Keep a listing of all used names
                arr = self.used_names.setdefault(name.value, [])
                arr.append(name)
                arr = self.scope_names_stack[-1].setdefault(name.value, [])
                arr.append(name)
                return name
        elif type == tokenize.STRING:
            return pr.String(value, start_pos, prefix)
        elif type == tokenize.NUMBER:
            return pr.Number(value, start_pos, prefix)
        elif type in (tokenize.NEWLINE, tokenize.ENDMARKER):
            return pr.Whitespace(value, start_pos, prefix)
        else:
            return pr.Operator(value, start_pos, prefix)
    def error_recovery(self, grammar, stack, type, value):
        """
        This parser is written in a dynamic way, meaning it allows using
        different grammars (even non-Python ones). However, error recovery
        is written purely for Python.
        """
        # For now just discard everything that is not a suite or
        # file_input, if we detect an error.
        for i, (dfa, state, (type, _)) in reversed(list(enumerate(stack))):
            # `suite` can sometimes be only simple_stmt, not stmt.
            if type in (grammar.symbol2number['file_input'],
                        grammar.symbol2number['suite']):
                index = i
                break
        self._stack_removal(stack, index + 1)

        # No success finding a transition
        #raise ParseError("bad input", type, value, context)

    def _stack_removal(self, stack, start_index):
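        # Drops every parser-stack entry from `start_index` on and
        # un-registers any pr.Name leaves those dead entries contributed, so
        # error recovery doesn't leave stale entries in the name bookkeeping.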
        def clear_names(children):
            for c in children:
                try:
                    clear_names(c.children)
                except AttributeError:
                    if isinstance(c, pr.Name):
                        try:
                            self.scope_names_stack[-1][c.value].remove(c)
                            self.used_names[c.value].remove(c)
                        except ValueError:
                            pass  # This may happen with CompFor.

        for dfa, state, node in stack[start_index:]:
            clear_names(children=node[1])
        stack[start_index:] = []

    def __init__old__(self, source, module_path=None, no_docstr=False,
                      tokenizer=None, top_module=None):
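        # Apparently the legacy, tokenizer-driven constructor, kept around
        # while the pgen2-based __init__ above is developed; nothing in this
        # module calls it.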
        self.no_docstr = no_docstr

        tokenizer = tokenizer or tokenize.source_tokens(source)
        self._gen = PushBackTokenizer(tokenizer)

        # initialize global Scope
        start_pos = next(self._gen).start_pos
        self._gen.push_last_back()
        self.module = pr.SubModule(module_path, start_pos, top_module)
        self._scope = self.module
        self._top_module = top_module or self.module

        try:
            self._parse()
        except (common.MultiLevelStopIteration, StopIteration):
            # StopIteration needs to be caught as well, because Python 2 has
            # a strange way of handling it; sometimes it isn't caught at all.
            # Just ignore it.
            pass

        # on finish, set end_pos correctly
        s = self._scope
        while s is not None:
            s.end_pos = self._gen.current.end_pos
            s = s.parent

        # clean up unused decorators
        for d in self._decorators:
            # Set a parent for unused decorators to avoid AttributeErrors
            # later, e.g. through `self.module.used_names`.
            d.parent = self.module

        self.module.end_pos = self._gen.current.end_pos
        if self._gen.current.type == tokenize.NEWLINE:
            # This case is only relevant with the FastTokenizer, because
            # otherwise there's always an ENDMARKER.
            # We added a newline before, so we need to "remove" it again.
            #
            # NOTE: end_pos should be kept as-is if the last token of the
            # source is a NEWLINE; otherwise the newline at the end of the
            # source is not included in ParserNode.code.
            if self._gen.previous.type != tokenize.NEWLINE:
                self.module.end_pos = self._gen.previous.end_pos

        del self._gen

    def __repr__(self):
        return "<%s: %s>" % (type(self).__name__, self.module)

    def _check_user_stmt(self, simple):
        # Despite the name, this does no user-specific checking; it just
        # updates used_names.
        for tok_name in self.module.temp_used_names:
            try:
                self.module.used_names[tok_name].add(simple)
            except KeyError:
                self.module.used_names[tok_name] = set([simple])
        self.module.temp_used_names = []

        if isinstance(simple, pr.Statement):
            for name, calls in simple.get_names_dict().items():
                self._scope.add_name_calls(name, calls)


class PushBackTokenizer(object):
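    """
    Wraps a token generator so that the last token returned can be pushed
    back and yielded again on the next call; used, for example, to peek at
    the first token, as ``__init__old__`` above does.
    """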
    def __init__(self, tokenizer):
        self._tokenizer = tokenizer
        self._push_backs = []
        self.current = self.previous = tokenize.Token(None, '', (0, 0))

    def push_last_back(self):
        self._push_backs.append(self.current)

    def next(self):
        """ Python 2 compatibility """
        return self.__next__()

    def __next__(self):
        if self._push_backs:
            return self._push_backs.pop(0)

        previous = self.current
        self.current = next(self._tokenizer)
        self.previous = previous
        return self.current

    def __iter__(self):
        return self
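

# A minimal usage sketch for PushBackTokenizer (assuming, as __init__old__
# above does, that tokenize.source_tokens() yields Token objects): peek at
# the first token, then hand it back so the next consumer sees the full
# stream.
#
#     gen = PushBackTokenizer(tokenize.source_tokens('x = 1\n'))
#     first = next(gen)      # consume the first token
#     gen.push_last_back()   # make it available again
#     assert next(gen) is first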