# jedi-fork/jedi/parser/python/parser.py
from jedi.parser.python import tree
from jedi.parser import tokenize
from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                               STRING, tok_name)
from jedi.parser.parser import BaseParser
from jedi.common import splitlines


class Parser(BaseParser):
    """
    This class is used to parse a Python file; it divides the file into a
    class structure of different scopes.

    :param grammar: The grammar object of pgen2. Loaded by load_grammar.
    """

    node_map = {
        'expr_stmt': tree.ExprStmt,
        'classdef': tree.Class,
        'funcdef': tree.Function,
        'file_input': tree.Module,
        'import_name': tree.ImportName,
        'import_from': tree.ImportFrom,
        'break_stmt': tree.KeywordStatement,
        'continue_stmt': tree.KeywordStatement,
        'return_stmt': tree.ReturnStmt,
        'raise_stmt': tree.KeywordStatement,
        'yield_expr': tree.YieldExpr,
        'del_stmt': tree.KeywordStatement,
        'pass_stmt': tree.KeywordStatement,
        'global_stmt': tree.GlobalStmt,
        'nonlocal_stmt': tree.KeywordStatement,
        'print_stmt': tree.KeywordStatement,
        'assert_stmt': tree.AssertStmt,
        'if_stmt': tree.IfStmt,
        'with_stmt': tree.WithStmt,
        'for_stmt': tree.ForStmt,
        'while_stmt': tree.WhileStmt,
        'try_stmt': tree.TryStmt,
        'comp_for': tree.CompFor,
        'decorator': tree.Decorator,
        'lambdef': tree.Lambda,
        'old_lambdef': tree.Lambda,
        'lambdef_nocond': tree.Lambda,
    }
    default_node = tree.PythonNode

    def __init__(self, grammar, error_recovery=True, start_symbol='file_input'):
        super(Parser, self).__init__(grammar, start_symbol,
                                     error_recovery=error_recovery)

        self.syntax_errors = []
        self._omit_dedent_list = []
        self._indent_counter = 0

        # TODO: do print/absolute-import detection here.
        # try:
        #     del python_grammar_no_print_statement.keywords["print"]
        # except KeyError:
        #     pass  # Doesn't exist in the Python 3 grammar.
        # if self.options["print_function"]:
        #     python_grammar = pygram.python_grammar_no_print_statement
        # else:

    def parse(self, tokens):
        if self._error_recovery:
            if self._start_symbol != 'file_input':
                raise NotImplementedError

            tokens = self._recovery_tokenize(tokens)

        node = super(Parser, self).parse(tokens)

        if self._start_symbol == 'file_input' != node.type:
            # If there's only one statement, we get back a non-module node.
            # That's not what we want; we want a module, so we wrap it here:
            node = self.convert_node(
                self._grammar,
                self._grammar.symbol2number['file_input'],
                [node]
            )

        return node
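
    # Minimal usage sketch (not part of the original file; `load_grammar`
    # and `source_tokens` are assumptions about this jedi version's helper
    # names, which may be spelled or located differently here):
    #
    #     from jedi.parser import load_grammar
    #     from jedi.parser.tokenize import source_tokens
    #
    #     grammar = load_grammar()
    #     tokens = source_tokens(source_code)
    #     module = Parser(grammar).parse(tokens)  # usually a tree.Module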

    def convert_node(self, grammar, type, children):
        """
        Convert raw node information to a PythonBaseNode instance.

        This method is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        # TODO REMOVE symbol, we don't want type here.
        symbol = grammar.number2symbol[type]
        try:
            return self.node_map[symbol](children)
        except KeyError:
            if symbol == 'suite':
                # We don't want the INDENT/DEDENT leaves in our parser tree.
                # They are virtual leaves, not real ones, and therefore have
                # pseudo start/end positions and no prefixes. Just drop them.
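                # For example, a suite [NEWLINE, INDENT, stmt, ..., DEDENT]
                # becomes [NEWLINE, stmt, ...].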
                children = [children[0]] + children[2:-1]
            return self.default_node(symbol, children)
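
    # Illustration (assumed call shape, following the pgen2 driver
    # convention): reducing a `classdef` rule amounts to
    #
    #     self.convert_node(grammar, grammar.symbol2number['classdef'], children)
    #
    # which returns a tree.Class; symbols missing from node_map fall back to
    # default_node (tree.PythonNode).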

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
        # print('leaf', repr(value), token.tok_name[type])
        if type == tokenize.NAME:
            if value in grammar.keywords:
                return tree.Keyword(value, start_pos, prefix)
            else:
                return tree.Name(value, start_pos, prefix)
        elif type == STRING:
            return tree.String(value, start_pos, prefix)
        elif type == NUMBER:
            return tree.Number(value, start_pos, prefix)
        elif type == NEWLINE:
            return tree.Newline(value, start_pos, prefix)
        elif type == ENDMARKER:
            return tree.EndMarker(value, start_pos, prefix)
        else:
            return tree.Operator(value, start_pos, prefix)
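
    # Dispatch example (grounded in the code above): for a NAME token, the
    # value decides the leaf class, e.g. 'if' is in grammar.keywords and
    # becomes tree.Keyword, while 'foo' becomes tree.Name.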

    def error_recovery(self, grammar, stack, arcs, typ, value, start_pos, prefix,
                       add_token_callback):
        """
        This parser is written in a dynamic way, meaning it can drive
        different grammars (even non-Python ones). The error recovery,
        however, is written purely for Python.
        """
        if not self._error_recovery:
            return super(Parser, self).error_recovery(
                grammar, stack, arcs, typ, value, start_pos, prefix,
                add_token_callback)

        def current_suite(stack):
            # For now just discard everything that is not a suite or
            # file_input, if we detect an error.
            for index, (dfa, state, (type_, nodes)) in reversed(list(enumerate(stack))):
                # `suite` can sometimes be only a simple_stmt, not a stmt.
                symbol = grammar.number2symbol[type_]
                if symbol == 'file_input':
                    break
                elif symbol == 'suite' and len(nodes) > 1:
                    # Suites without an indent in them get discarded.
                    break
                elif symbol == 'simple_stmt' and len(nodes) > 1:
                    # simple_stmt can just be turned into a PythonNode if
                    # there are enough statements. Ignore the rest after that.
                    break
            return index, symbol, nodes

        index, symbol, nodes = current_suite(stack)
        if symbol == 'simple_stmt':
            index -= 2
            (_, _, (type_, suite_nodes)) = stack[index]
            symbol = grammar.number2symbol[type_]
            suite_nodes.append(tree.PythonNode(symbol, list(nodes)))
            # Remove the nodes from the simple_stmt entry; they now live in
            # the enclosing suite.
            nodes[:] = []
            nodes = suite_nodes

        # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
        if self._stack_removal(grammar, stack, arcs, index + 1, value, start_pos):
            add_token_callback(typ, value, start_pos, prefix)
        else:
            if typ == INDENT:
                # For every deleted INDENT we have to delete a DEDENT as well.
                # Otherwise the parser gets into trouble and DEDENTs too early.
                self._omit_dedent_list.append(self._indent_counter)
            else:
                error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value,
                                                  start_pos, prefix)
                stack[-1][2][1].append(error_leaf)
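
    # Hedged sketch of the observable effect: with error recovery enabled,
    # input such as "def f(:\n    pass\n" still parses to a module; the
    # tokens that could not be shifted end up wrapped in tree.PythonErrorNode
    # / tree.PythonErrorLeaf children instead of aborting the parse. The
    # exact error-tree shape depends on the grammar and input.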

    def _stack_removal(self, grammar, stack, arcs, start_index, value, start_pos):
        """
        Pop all stack entries from `start_index` onwards and collect their
        nodes into a single PythonErrorNode attached to the surviving entry.
        """
        failed_stack = []
        found = False
        all_nodes = []
        for dfa, state, (typ, nodes) in stack[start_index:]:
            if nodes:
                found = True
            if found:
                symbol = grammar.number2symbol[typ]
                failed_stack.append((symbol, nodes))
                all_nodes += nodes
        if failed_stack:
            stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes))

        stack[start_index:] = []
        return failed_stack

    def _recovery_tokenize(self, tokens):
        for typ, value, start_pos, prefix in tokens:
            # print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
            if typ == DEDENT:
                # We need to count indents, because if we just omit any
                # DEDENT, we might omit them in the wrong place.
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1

            yield typ, value, start_pos, prefix
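
    # Note on the bookkeeping above: _indent_counter labels each indentation
    # level. When error_recovery() drops an INDENT, it records the current
    # counter value in _omit_dedent_list, and the DEDENT closing exactly that
    # level is swallowed here as well, keeping INDENT/DEDENT pairs balanced
    # for the parser.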


def _remove_last_newline(node):
    """
    Strip the very last newline of a parsed module, either from the
    endmarker's prefix or from the preceding newline leaf, and update the
    endmarker's start position accordingly.
    """
    endmarker = node.children[-1]
    # The newline is either in the endmarker as a prefix or in the previous
    # leaf as a newline token.
    prefix = endmarker.prefix
    leaf = endmarker.get_previous_leaf()
    if prefix:
        text = prefix
    else:
        if leaf is None:
            raise ValueError("You're trying to remove a newline from an empty module.")

        text = leaf.value

    if not text.endswith('\n'):
        raise ValueError("There's no newline at the end, cannot remove it.")

    text = text[:-1]
    if prefix:
        endmarker.prefix = text
        if leaf is None:
            end_pos = (1, 0)
        else:
            end_pos = leaf.end_pos

        lines = splitlines(text, keepends=True)
        if len(lines) == 1:
            end_pos = end_pos[0], end_pos[1] + len(lines[0])
        else:
            end_pos = end_pos[0] + len(lines) - 1, len(lines[-1])
        endmarker.start_pos = end_pos
    else:
        leaf.value = text
        endmarker.start_pos = leaf.end_pos
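

# Hedged usage sketch (the caller lives outside this file; this mirrors the
# common jedi pattern of appending a newline so the tokenizer terminates,
# then removing it from the tree afterwards):
#
#     module = Parser(grammar).parse(tokens)  # source had '\n' appended
#     _remove_last_newline(module)            # positions exclude it again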