# parso/parso/python/parser.py

from parso.python import tree
from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                                STRING, tok_name, NAME, FSTRING_STRING,
                                FSTRING_START, FSTRING_END)
from parso.parser import BaseParser
from parso.pgen2.parse import token_to_ilabel


class Parser(BaseParser):
    """
    This class is used to parse a Python file and then divide it into a
    class structure of different scopes.

    :param pgen_grammar: The grammar object of pgen2. Loaded by load_grammar.
    """
    node_map = {
        'expr_stmt': tree.ExprStmt,
        'classdef': tree.Class,
        'funcdef': tree.Function,
        'file_input': tree.Module,
        'import_name': tree.ImportName,
        'import_from': tree.ImportFrom,
        'break_stmt': tree.KeywordStatement,
        'continue_stmt': tree.KeywordStatement,
        'return_stmt': tree.ReturnStmt,
        'raise_stmt': tree.KeywordStatement,
        'yield_expr': tree.YieldExpr,
        'del_stmt': tree.KeywordStatement,
        'pass_stmt': tree.KeywordStatement,
        'global_stmt': tree.GlobalStmt,
        'nonlocal_stmt': tree.KeywordStatement,
        'print_stmt': tree.KeywordStatement,
        'assert_stmt': tree.AssertStmt,
        'if_stmt': tree.IfStmt,
        'with_stmt': tree.WithStmt,
        'for_stmt': tree.ForStmt,
        'while_stmt': tree.WhileStmt,
        'try_stmt': tree.TryStmt,
        'comp_for': tree.CompFor,
        # Not sure if this is the best idea, but IMO it's the easiest way to
        # avoid extreme amounts of work around the subtle differences between
        # the Python 2 and 3 grammars for list comprehensions.
        'list_for': tree.CompFor,
        # Same here. This only exists in Python 2.6.
        'gen_for': tree.CompFor,
        'decorator': tree.Decorator,
        'lambdef': tree.Lambda,
        'old_lambdef': tree.Lambda,
        'lambdef_nocond': tree.Lambda,
    }
    default_node = tree.PythonNode

    # Names/Keywords are handled separately
    _leaf_map = {
        STRING: tree.String,
        NUMBER: tree.Number,
        NEWLINE: tree.Newline,
        ENDMARKER: tree.EndMarker,
        FSTRING_STRING: tree.FStringString,
        FSTRING_START: tree.FStringStart,
        FSTRING_END: tree.FStringEnd,
    }
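
    # A hedged sketch of the mapping above in action: the tokenizer emits
    # (NUMBER, '42', ...) for a literal and convert_leaf() below turns it
    # into a tree.Number leaf:
    #
    #     import parso
    #     num = parso.parse('42\n').children[0].children[0]
    #     assert num.type == 'number' and num.value == '42'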

    def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
        super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)

        self.syntax_errors = []
        self._omit_dedent_list = []
        self._indent_counter = 0

        # TODO do print absolute import detection here.
        # try:
        #     del python_grammar_no_print_statement.keywords["print"]
        # except KeyError:
        #     pass  # Doesn't exist in the Python 3 grammar.

        # if self.options["print_function"]:
        #     python_grammar = pygram.python_grammar_no_print_statement
        # else:

    def parse(self, tokens):
        if self._error_recovery:
            if self._start_symbol != 'file_input':
                raise NotImplementedError

            tokens = self._recovery_tokenize(tokens)

        node = super(Parser, self).parse(tokens)

        if self._start_symbol == 'file_input' != node.type:
            # If there's only one statement, we get back a non-module. That's
            # not what we want; we want a module, so we wrap it here:
            node = self.convert_node(
                self._pgen_grammar,
                self._pgen_grammar.symbol2number['file_input'],
                [node]
            )

        return node
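
    # Illustration of the wrapping above (hedged): even a single expression
    # statement comes back as a whole module:
    #
    #     import parso
    #     assert parso.parse('1 + 2').type == 'file_input'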

    def convert_node(self, pgen_grammar, type, children):
        """
        Convert raw node information to a PythonBaseNode instance.

        This is passed to the parser driver which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        # TODO REMOVE symbol, we don't want type here.
        symbol = pgen_grammar.number2symbol[type]
        try:
            return self.node_map[symbol](children)
        except KeyError:
            if symbol == 'suite':
                # We don't want the INDENT/DEDENT in our parser tree. Those
                # leaves are just cancer. They are virtual leaves and not real
                # ones and therefore have pseudo start/end positions and no
                # prefixes. Just ignore them.
                children = [children[0]] + children[2:-1]
            elif symbol == 'list_if':
                # Make transitioning from 2 to 3 easier.
                symbol = 'comp_if'
            elif symbol == 'listmaker':
                # Same as list_if above.
                symbol = 'testlist_comp'
            return self.default_node(symbol, children)
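
    # Hedged example of the suite stripping above: for a small function, the
    # suite keeps only the newline and the statement, no INDENT/DEDENT:
    #
    #     import parso
    #     suite = parso.parse('def f():\n    pass\n').children[0].children[-1]
    #     assert [c.type for c in suite.children] == ['newline', 'simple_stmt']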

    def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
        # print('leaf', repr(value), token.tok_name[type])
        if type == NAME:
            if value in pgen_grammar.keywords:
                return tree.Keyword(value, start_pos, prefix)
            else:
                return tree.Name(value, start_pos, prefix)

        return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)
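
    # Sketch (hedged): a NAME token only becomes a Keyword leaf if the loaded
    # grammar lists it as a keyword; everything else stays a plain Name:
    #
    #     import parso
    #     leaf = parso.parse('pass\n').children[0].children[0]
    #     assert leaf.type == 'keyword'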

    def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                       add_token_callback):
        def get_symbol_and_nodes(stack):
            for dfa, state, (type_, nodes) in stack:
                symbol = pgen_grammar.number2symbol[type_]
                yield symbol, nodes

        tos_nodes = stack.get_tos_nodes()
        if tos_nodes:
            last_leaf = tos_nodes[-1].get_last_leaf()
        else:
            last_leaf = None

        if self._start_symbol == 'file_input' and \
                (typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
            def reduce_stack(states, newstate):
                # Reduce: pop every rule that is already complete (its only
                # remaining arc is the accepting self-loop).
                state = newstate
                while states[state] == [(0, state)]:
                    self.pgen_parser._pop()

                    dfa, state, (type_, nodes) = stack[-1]
                    states, first = dfa

            # In Python, statements need to end with a newline. But since it's
            # possible (and valid Python) that there's no newline at the end
            # of a file, we have to recover even if the user doesn't want
            # error recovery.
            #print('x', pprint.pprint(stack))
            ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)

            dfa, state, (type_, nodes) = stack[-1]
            symbol = pgen_grammar.number2symbol[type_]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                if ilabel == i:
                    if symbol == 'simple_stmt':
                        # This is basically shifting
                        stack[-1] = (dfa, newstate, (type_, nodes))

                        reduce_stack(states, newstate)
                        add_token_callback(typ, value, start_pos, prefix)
                        return
                    # Check if we're at the right point
                    #for symbol, nodes in get_symbol_and_nodes(stack):
                    #    self.pgen_parser._pop()
                    #break
                    break
            #symbol = pgen_grammar.number2symbol[type_]

        if not self._error_recovery:
            return super(Parser, self).error_recovery(
                pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                add_token_callback)

        def current_suite(stack):
            # For now just discard everything that is not a suite or
            # file_input, if we detect an error.
            one_line_suite = False
            for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
                # `suite` can sometimes be only simple_stmt, not stmt.
                if one_line_suite:
                    break
                elif symbol == 'file_input':
                    break
                elif symbol == 'suite':
                    if len(nodes) > 1:
                        break
                    elif not nodes:
                        one_line_suite = True
                        # A `suite` without an indent is an error node.
            return index, symbol, nodes

        index, symbol, nodes = current_suite(stack)
        # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
        if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
            add_token_callback(typ, value, start_pos, prefix)
        else:
            if typ == INDENT:
                # For every deleted INDENT we have to delete a DEDENT as well.
                # Otherwise the parser will get into trouble and DEDENT too
                # early.
                self._omit_dedent_list.append(self._indent_counter)

            error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
            stack[-1][2][1].append(error_leaf)

        if symbol == 'suite':
            dfa, state, node = stack[-1]
            states, first = dfa
            arcs = states[state]
            intended_label = pgen_grammar.symbol2label['stmt']
            # Introduce a proper state transition. We're basically allowing
            # there to be no valid statements inside a suite.
            if [x[0] for x in arcs] == [intended_label]:
                new_state = arcs[0][1]
                stack[-1] = dfa, new_state, node
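
    # Hedged illustration: with error recovery enabled (parso's default),
    # broken input never raises; bad tokens are kept as PythonErrorLeaf and
    # discarded stack parts as PythonErrorNode:
    #
    #     import parso
    #     module = parso.parse('def broken(:\n    pass\n')
    #     assert module.type == 'file_input'  # still a module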

    def _stack_removal(self, pgen_grammar, stack, arcs, start_index, value, start_pos):
        # Cut the broken part of the stack away: everything pushed after
        # `start_index` is collected into one PythonErrorNode and attached to
        # the node below it, so parsing can continue from there.
        failed_stack = False
        found = False
        all_nodes = []
        for dfa, state, (type_, nodes) in stack[start_index:]:
            if nodes:
                found = True
            if found:
                failed_stack = True
                all_nodes += nodes
        if failed_stack:
            stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes))

        stack[start_index:] = []
        return failed_stack

    def _recovery_tokenize(self, tokens):
        for typ, value, start_pos, prefix in tokens:
            # print(tok_name[typ], repr(value), start_pos, repr(prefix))
            if typ == DEDENT:
                # We need to count indents, because if we just omit any
                # DEDENT, we might omit them in the wrong place.
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1
            yield typ, value, start_pos, prefix
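
# Hedged sketch of the newline special case in error_recovery() above: a file
# that ends without a trailing newline is still parsed as a complete module:
#
#     import parso
#     assert parso.parse('x = 1').type == 'file_input'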