mirror of https://github.com/davidhalter/parso.git
Merge the PgenParser and our own parser

parso/parser.py (150 lines changed)
@@ -1,3 +1,11 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright David Halter and Contributors
+# Modifications are dual-licensed: MIT and PSF.
+# 99% of the code is different from pgen2, now.
+
 """
 The ``Parser`` tries to convert the available Python code in an easy to read
 format, something like an abstract syntax tree. The classes who represent this
@@ -16,7 +24,6 @@ complexity of the ``Parser`` (there's another parser sitting inside
 ``Statement``, which produces ``Array`` and ``Call``).
 """
 from parso import tree
-from parso.pgen2 import PgenParser
 
 
 class ParserSyntaxError(Exception):
@@ -30,7 +37,81 @@ class ParserSyntaxError(Exception):
         self.error_leaf = error_leaf
 
 
+class InternalParseError(Exception):
+    """
+    Exception to signal the parser is stuck and error recovery didn't help.
+    Basically this shouldn't happen. It's a sign that something is really
+    wrong.
+    """
+
+    def __init__(self, msg, type_, value, start_pos):
+        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
+                           (msg, type_.name, value, start_pos))
+        self.msg = msg
+        self.type = type_
+        self.value = value
+        self.start_pos = start_pos
+
+
+class Stack(list):
+    def get_tos_nodes(self):
+        tos = self[-1]
+        return tos[2][1]
+
+    def get_tos_first_tokens(self, grammar):
+        tos = self[-1]
+        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
+        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
+        dfa, state, nodes = tos
+
+        def check():
+            for first in dfa[1]:
+                try:
+                    yield inv_keywords[first]
+                except KeyError:
+                    yield tokenize.tok_name[inv_tokens[first]]
+
+        return sorted(check())
+
+
+class StackNode(object):
+    def __init__(self, dfa):
+        self.dfa = dfa
+        self.nodes = []
+
+    @property
+    def nonterminal(self):
+        return self.dfa.from_rule
+
+    def __repr__(self):
+        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
+
+
+def _token_to_transition(grammar, type_, value):
+    # Map from token to label
+    if type_.contains_syntax:
+        # Check for reserved words (keywords)
+        try:
+            return grammar.reserved_syntax_strings[value]
+        except KeyError:
+            pass
+
+    return type_
+
+
 class BaseParser(object):
+    """Parser engine.
+
+    A Parser instance contains state pertaining to the current token
+    sequence, and should not be used concurrently by different threads
+    to parse separate token sequences.
+
+    See python/tokenize.py for how to get input tokens by a string.
+
+    When a syntax error occurs, error_recovery() is called.
+    """
+
     node_map = {}
     default_node = tree.Node
 
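
The `_token_to_transition` helper above is the hinge between the tokenizer and the DFA tables: a NAME token whose string happens to be a reserved word must follow the keyword transition, not the generic NAME one. A minimal runnable sketch of that logic, using toy stand-ins for parso's internals (only `contains_syntax` and `reserved_syntax_strings` are real attribute names taken from the diff; everything else is invented for illustration):

    class FakeTokenType:
        # Stand-in for parso's token type objects. contains_syntax mirrors
        # the attribute checked above: True for tokens (e.g. NAME) whose
        # string value might be a reserved keyword.
        def __init__(self, name, contains_syntax):
            self.name = name
            self.contains_syntax = contains_syntax

        def __repr__(self):
            return self.name

    class FakeGrammar:
        def __init__(self, keywords):
            # Keyword string -> transition label, as consulted above.
            self.reserved_syntax_strings = {k: 'keyword:' + k for k in keywords}

    def _token_to_transition(grammar, type_, value):
        if type_.contains_syntax:
            try:
                return grammar.reserved_syntax_strings[value]
            except KeyError:
                pass
        return type_

    NAME = FakeTokenType('NAME', contains_syntax=True)
    NUMBER = FakeTokenType('NUMBER', contains_syntax=False)
    grammar = FakeGrammar(['if', 'while'])

    print(_token_to_transition(grammar, NAME, 'if'))    # keyword:if
    print(_token_to_transition(grammar, NAME, 'foo'))   # NAME (plain identifier)
    print(_token_to_transition(grammar, NUMBER, '42'))  # NUMBER
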
@@ -44,15 +125,21 @@ class BaseParser(object):
         self._error_recovery = error_recovery
 
     def parse(self, tokens):
-        self.pgen_parser = PgenParser(
-            self._pgen_grammar, self.convert_node, self.convert_leaf,
-            self.error_recovery, self._start_nonterminal
-        )
+        first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
+        self.stack = Stack([StackNode(first_dfa)])
 
-        node = self.pgen_parser.parse(tokens)
-        # The stack is empty now, we don't need it anymore.
-        del self.pgen_parser
-        return node
+        for type_, value, start_pos, prefix in tokens:
+            self.add_token(type_, value, start_pos, prefix)
+
+        while self.stack and self.stack[-1].dfa.is_final:
+            self._pop()
+
+        if self.stack:
+            # We never broke out -- EOF is too soon -- Unfinished statement.
+            # However, the error recovery might have added the token again, if
+            # the stack is empty, we're fine.
+            raise InternalParseError("incomplete input", type_, value, start_pos)
+        return self.rootnode
 
     def error_recovery(self, pgen_grammar, stack, typ, value, start_pos, prefix,
                        add_token_callback):
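
With this hunk, `parse()` stops delegating to a separately constructed PgenParser: it seeds the stack with the start nonterminal's first DFA, shifts every token through `add_token()`, then pops whatever DFAs are already final at EOF. Externally nothing changes; parso's public entry point still round-trips source exactly (a quick check, assuming an installed parso):

    import parso

    # parso.parse() ultimately drives the BaseParser machinery shown above.
    module = parso.parse('x = 1\n')
    print(module.get_code() == 'x = 1\n')  # True: whitespace survives as leaf prefixes
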
@@ -73,3 +160,48 @@ class BaseParser(object):
             return self.leaf_map[type_](value, start_pos, prefix)
         except KeyError:
             return self.default_leaf(value, start_pos, prefix)
+
+    def add_token(self, type_, value, start_pos, prefix):
+        """Add a token; return True if this is the end of the program."""
+        grammar = self._pgen_grammar
+        stack = self.stack
+        transition = _token_to_transition(grammar, type_, value)
+
+        while True:
+            try:
+                plan = stack[-1].dfa.transition_to_plan[transition]
+                break
+            except KeyError:
+                if stack[-1].dfa.is_final:
+                    self._pop()
+                else:
+                    self.error_recovery(grammar, stack, type_,
+                                        value, start_pos, prefix, self.add_token)
+                    return
+            except IndexError:
+                raise InternalParseError("too much input", type_, value, start_pos)
+
+        stack[-1].dfa = plan.next_dfa
+
+        for push in plan.dfa_pushes:
+            stack.append(StackNode(push))
+
+        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
+        stack[-1].nodes.append(leaf)
+
+    def _pop(self):
+        tos = self.stack.pop()
+        # If there's exactly one child, return that child instead of
+        # creating a new node. We still create expr_stmt and
+        # file_input though, because a lot of Jedi depends on its
+        # logic.
+        if len(tos.nodes) == 1:
+            new_node = tos.nodes[0]
+        else:
+            new_node = self.convert_node(self._pgen_grammar, tos.dfa.from_rule, tos.nodes)
+
+        try:
+            self.stack[-1].nodes.append(new_node)
+        except IndexError:
+            # Stack is empty, set the rootnode.
+            self.rootnode = new_node
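
These two methods are the whole engine: `add_token()` is the shift step (look up a transition plan, descend into any pushed DFAs, append the leaf to the top of stack) and `_pop()` is the reduce step (turn a finished DFA's collected children into a node appended one level down). A self-contained toy run, with hand-built fakes for the DFA and plan objects (the attribute names match the diff; the objects themselves are invented):

    class FakePlan:
        def __init__(self, next_dfa, dfa_pushes=()):
            self.next_dfa = next_dfa            # state the current DFA moves to
            self.dfa_pushes = list(dfa_pushes)  # nested rules to descend into

    class FakeDFA:
        def __init__(self, from_rule, is_final=False):
            self.from_rule = from_rule
            self.is_final = is_final
            self.transition_to_plan = {}

    class StackNode:
        def __init__(self, dfa):
            self.dfa = dfa
            self.nodes = []

    # Toy rule: sum ::= NUMBER '+' NUMBER
    s0, s1, s2, s3 = (FakeDFA('sum') for _ in range(4))
    s3.is_final = True
    s0.transition_to_plan['NUMBER'] = FakePlan(s1)
    s1.transition_to_plan['+'] = FakePlan(s2)
    s2.transition_to_plan['NUMBER'] = FakePlan(s3)

    stack = [StackNode(s0)]
    for token in ['NUMBER', '+', 'NUMBER']:
        plan = stack[-1].dfa.transition_to_plan[token]  # shift: look up the plan
        stack[-1].dfa = plan.next_dfa
        for push in plan.dfa_pushes:                    # none in this flat rule
            stack.append(StackNode(push))
        stack[-1].nodes.append(token)                   # leaf lands on the TOS

    tos = stack.pop()                                   # reduce: the DFA is final
    print(tos.dfa.from_rule, tos.nodes)  # sum ['NUMBER', '+', 'NUMBER']
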
parso/pgen2/__init__.py
@@ -8,4 +8,3 @@
 # Modifications are dual-licensed: MIT and PSF.
 
 from parso.pgen2.generator import generate_grammar
-from parso.pgen2.parse import PgenParser
parso/pgen2/parse.py (deleted)
@@ -1,201 +0,0 @@
-# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
-# Licensed to PSF under a Contributor Agreement.
-
-# Modifications:
-# Copyright David Halter and Contributors
-# Modifications are dual-licensed: MIT and PSF.
-
-"""
-Parser engine for the grammar tables generated by pgen.
-
-The grammar table must be loaded first.
-
-See Parser/parser.c in the Python distribution for additional info on
-how this parsing engine works.
-"""
-
-
-class InternalParseError(Exception):
-    """
-    Exception to signal the parser is stuck and error recovery didn't help.
-    Basically this shouldn't happen. It's a sign that something is really
-    wrong.
-    """
-
-    def __init__(self, msg, type_, value, start_pos):
-        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
-                           (msg, type_.name, value, start_pos))
-        self.msg = msg
-        self.type = type
-        self.value = value
-        self.start_pos = start_pos
-
-
-class Stack(list):
-    def get_tos_nodes(self):
-        tos = self[-1]
-        return tos[2][1]
-
-    def get_tos_first_tokens(self, grammar):
-        tos = self[-1]
-        inv_tokens = dict((v, k) for k, v in grammar.tokens.items())
-        inv_keywords = dict((v, k) for k, v in grammar.keywords.items())
-        dfa, state, nodes = tos
-
-        def check():
-            for first in dfa[1]:
-                try:
-                    yield inv_keywords[first]
-                except KeyError:
-                    yield tokenize.tok_name[inv_tokens[first]]
-
-        return sorted(check())
-
-
-class StackNode(object):
-    def __init__(self, dfa):
-        self.dfa = dfa
-        self.nodes = []
-
-    @property
-    def nonterminal(self):
-        return self.dfa.from_rule
-
-    def __repr__(self):
-        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
-
-
-def _token_to_transition(grammar, type_, value):
-    # Map from token to label
-    if type_.contains_syntax:
-        # Check for reserved words (keywords)
-        try:
-            return grammar.reserved_syntax_strings[value]
-        except KeyError:
-            pass
-
-    return type_
-
-
-class PgenParser(object):
-    """Parser engine.
-
-    The proper usage sequence is:
-
-    p = Parser(grammar, [converter])  # create instance
-    p.setup([start])                  # prepare for parsing
-    <for each input token>:
-        if p.add_token(...):          # parse a token
-            break
-    root = p.rootnode                 # root of abstract syntax tree
-
-    A Parser instance may be reused by calling setup() repeatedly.
-
-    A Parser instance contains state pertaining to the current token
-    sequence, and should not be used concurrently by different threads
-    to parse separate token sequences.
-
-    See driver.py for how to get input tokens by tokenizing a file or
-    string.
-
-    Parsing is complete when add_token() returns True; the root of the
-    abstract syntax tree can then be retrieved from the rootnode
-    instance variable.  When a syntax error occurs, error_recovery()
-    is called.  There is no error recovery; the parser cannot be used
-    after a syntax error was reported (but it can be reinitialized by
-    calling setup()).
-    """
-
-    def __init__(self, grammar, convert_node, convert_leaf, error_recovery,
-                 start_nonterminal):
-        """Constructor.
-
-        The grammar argument is a grammar.Grammar instance; see the
-        grammar module for more information.
-
-        The parser is not ready yet for parsing; you must call the
-        setup() method to get it started.
-
-        The optional convert argument is a function mapping concrete
-        syntax tree nodes to abstract syntax tree nodes.  If not
-        given, no conversion is done and the syntax tree produced is
-        the concrete syntax tree.  If given, it must be a function of
-        two arguments, the first being the grammar (a grammar.Grammar
-        instance), and the second being the concrete syntax tree node
-        to be converted.  The syntax tree is converted from the bottom
-        up.
-
-        A concrete syntax tree node is a (type, nodes) tuple, where
-        type is the node type (a token or nonterminal number) and nodes
-        is a list of children for nonterminals, and None for tokens.
-
-        An abstract syntax tree node may be anything; this is entirely
-        up to the converter function.
-        """
-        self.grammar = grammar
-        self.convert_node = convert_node
-        self.convert_leaf = convert_leaf
-
-        self.stack = Stack([StackNode(grammar.nonterminal_to_dfas[start_nonterminal][0])])
-        self.error_recovery = error_recovery
-
-    def parse(self, tokens):
-        for type_, value, start_pos, prefix in tokens:
-            self.add_token(type_, value, start_pos, prefix)
-
-        while self.stack and self.stack[-1].dfa.is_final:
-            self._pop()
-
-        if self.stack:
-            # We never broke out -- EOF is too soon -- Unfinished statement.
-            # However, the error recovery might have added the token again, if
-            # the stack is empty, we're fine.
-            raise InternalParseError("incomplete input", type_, value, start_pos)
-        return self.rootnode
-
-    def add_token(self, type_, value, start_pos, prefix):
-        """Add a token; return True if this is the end of the program."""
-        transition = _token_to_transition(self.grammar, type_, value)
-        stack = self.stack
-        grammar = self.grammar
-
-        while True:
-            try:
-                plan = stack[-1].dfa.transition_to_plan[transition]
-                break
-            except KeyError:
-                if stack[-1].dfa.is_final:
-                    self._pop()
-                else:
-                    self.error_recovery(grammar, stack, type_,
-                                        value, start_pos, prefix, self.add_token)
-                    return
-            except IndexError:
-                raise InternalParseError("too much input", type_, value, start_pos)
-
-        stack[-1].dfa = plan.next_dfa
-
-        for push in plan.dfa_pushes:
-            stack.append(StackNode(push))
-
-        leaf = self.convert_leaf(grammar, type_, value, prefix, start_pos)
-        stack[-1].nodes.append(leaf)
-
-    def _pop(self):
-        tos = self.stack.pop()
-        # If there's exactly one child, return that child instead of
-        # creating a new node. We still create expr_stmt and
-        # file_input though, because a lot of Jedi depends on its
-        # logic.
-        if len(tos.nodes) == 1:
-            new_node = tos.nodes[0]
-        else:
-            new_node = self.convert_node(self.grammar, tos.dfa.from_rule, tos.nodes)
-
-        try:
-            self.stack[-1].nodes.append(new_node)
-        except IndexError:
-            # Stack is empty, set the rootnode.
-            self.rootnode = new_node
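
With parse.py gone, everything it defined lives on in parso/parser.py, and there is no separate engine object left to construct: the BaseParser subclass is the engine, so the `p = Parser(...)` / `p.setup(...)` protocol described in the docstring above disappears with the file. As of this commit the machinery imports from its new home (module layout may have shifted again in later releases):

    # The stack machinery now lives next to BaseParser instead of in
    # parso.pgen2.parse, which this commit deletes.
    from parso.parser import BaseParser, Stack, StackNode, InternalParseError

    # Hypothetical driver, reconstructed from the diff rather than from a
    # documented API: a grammar plus a start symbol is all parse() needs now.
    #     parser = SomeBaseParserSubclass(pgen_grammar, start_nonterminal)
    #     tree = parser.parse(tokens)
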
parso/python/diff.py
@@ -287,7 +287,7 @@ class DiffParser(object):
         omitted_first_indent = False
         indents = []
         tokens = self._tokenizer(lines, (1, 0))
-        stack = self._active_parser.pgen_parser.stack
+        stack = self._active_parser.stack
         for typ, string, start_pos, prefix in tokens:
             start_pos = start_pos[0] + line_offset, start_pos[1]
             if typ == PythonTokenTypes.INDENT:
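
The attribute path is the only change here: DiffParser no longer reaches through a composed `pgen_parser`, because the active parser now carries the stack itself. The surrounding loop is still worth a second look: DiffParser re-tokenizes a slice of the file starting at line 1, then shifts each token back to its true location. The tuple arithmetic in isolation (values invented for illustration):

    line_offset = 286              # suppose the re-tokenized slice began at line 287
    start_pos = (2, 4)             # token position relative to the slice
    start_pos = start_pos[0] + line_offset, start_pos[1]
    print(start_pos)               # (288, 4): the line is shifted, the column is not
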