Try to completely remove the word symbol and use nonterminal

The ones that we could not remove are in grammar.py, because that's the public documented API.
Dave Halter
2018-06-17 18:30:20 +02:00
parent 640f544af9
commit 73ce57428b
7 changed files with 89 additions and 86 deletions

View File

@@ -51,8 +51,8 @@ class Grammar(object):
it is invalid, it will be returned as an error node. If disabled,
you will get a ParseError when encountering syntax errors in your
code.
:param str start_symbol: The grammar symbol that you want to parse. Only
allowed to be used when error_recovery is False.
:param str start_symbol: The grammar rule (nonterminal) that you want
to parse. Only allowed to be used when error_recovery is False.
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
@@ -88,7 +88,7 @@ class Grammar(object):
raise TypeError("Please provide either code or a path.")
if start_symbol is None:
start_symbol = self._start_symbol
start_symbol = self._start_nonterminal
if error_recovery and start_symbol != 'file_input':
raise NotImplementedError("This is currently not implemented.")
@@ -136,7 +136,7 @@ class Grammar(object):
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_symbol=start_symbol
start_nonterminal=start_symbol
)
root_node = p.parse(tokens=tokens)
@@ -186,7 +186,7 @@ class Grammar(object):
return normalizer.issues
def __repr__(self):
labels = self._pgen_grammar.number2symbol.values()
labels = self._pgen_grammar.number2nonterminal.values()
txt = ' '.join(list(labels)[:3]) + ' ...'
return '<%s:%s>' % (self.__class__.__name__, txt)
@@ -194,7 +194,7 @@ class Grammar(object):
class PythonGrammar(Grammar):
_error_normalizer_config = ErrorFinderConfig()
_token_namespace = token
_start_symbol = 'file_input'
_start_nonterminal = 'file_input'
def __init__(self, version_info, bnf_text):
super(PythonGrammar, self).__init__(
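As a usage sketch of the public API described in the docstring above (the start_symbol keyword is kept because it is documented; using 'eval_input' as an alternative start rule is an assumption about the shipped Python grammar):

    import parso

    grammar = parso.load_grammar()

    # Default: parse a whole module with error recovery enabled.
    module = grammar.parse("x = 1\n")

    # A different start rule requires error_recovery=False, as documented above.
    expr = grammar.parse("1 + 2", error_recovery=False, start_symbol="eval_input")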

View File

@@ -38,13 +38,13 @@ class BaseParser(object):
}
default_leaf = tree.Leaf
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False):
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
self._pgen_grammar = pgen_grammar
self._start_symbol = start_symbol
self._start_nonterminal = start_nonterminal
self._error_recovery = error_recovery
def parse(self, tokens):
start_number = self._pgen_grammar.symbol2number[self._start_symbol]
start_number = self._pgen_grammar.nonterminal2number[self._start_nonterminal]
self.pgen_parser = PgenParser(
self._pgen_grammar, self.convert_node, self.convert_leaf,
self.error_recovery, start_number
@@ -64,12 +64,12 @@ class BaseParser(object):
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
def convert_node(self, pgen_grammar, type_, children):
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type_]
# TODO REMOVE nonterminal, we don't want type here.
nonterminal = pgen_grammar.number2nonterminal[type_]
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
return self.default_node(symbol, children)
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
try:

View File

@@ -28,12 +28,14 @@ class Grammar(object):
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
nonterminal2number --
A dict mapping nonterminal names to numbers.
Nonterminal numbers are always 256 or higher, to
distinguish them from token numbers, which are between 0
and 255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
number2nonterminal --
A dict mapping numbers to nonterminal names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
@@ -44,20 +46,20 @@ class Grammar(object):
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
dfas -- a dict mapping nonterminal numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
number or a nonterminal number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
start -- the number of the grammar's start nonterminal.
keywords -- a dict mapping keyword strings to arc labels.
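To make these renamed tables concrete, here is a hypothetical snapshot for a two-rule grammar (all names and numbers below are invented for illustration, not dumped from a real pgen run):

    # Nonterminal numbers start at 256 so they never collide with token numbers.
    nonterminal2number = {'file_input': 256, 'simple_stmt': 257}
    number2nonterminal = {n: name for name, n in nonterminal2number.items()}

    # dfas: nonterminal number -> (DFA, first set). The DFA is a list of states,
    # each state a list of (label, next_state) arcs; (0, j) marks state j as final.
    dfas = {
        257: (
            [[(2, 1)], [(0, 1)]],   # two states for 'simple_stmt'
            {2: 1},                 # first set: labels that may start the rule, values always 1
        ),
    }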
@@ -65,29 +67,29 @@ class Grammar(object):
"""
def __init__(self, bnf_text, start_symbol):
self.symbol2number = {}
self.number2symbol = {}
def __init__(self, bnf_text, start_nonterminal):
self.nonterminal2number = {}
self.number2nonterminal = {}
self.states = []
self.dfas = {}
self.labels = [(0, "EMPTY")]
self.keywords = {}
self.tokens = {}
self.symbol2label = {}
self.label2symbol = {}
self.start_symbol = start_symbol
self.nonterminal2label = {}
self.label2nonterminal = {}
self.start_nonterminal = start_nonterminal
@property
def start(self):
return self.symbol2number[self.start_symbol]
return self.nonterminal2number[self.start_nonterminal]
def report(self):
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
pprint(self.nonterminal2number)
print("n2s")
pprint(self.number2symbol)
pprint(self.number2nonterminal)
print("states")
pprint(self.states)
print("dfas")

View File

@@ -118,8 +118,8 @@ class PgenParser(object):
up.
A concrete syntax tree node is a (type, nodes) tuple, where
type is the node type (a token or symbol number) and nodes
is a list of children for symbols, and None for tokens.
type is the node type (a token or nonterminal number) and nodes
is a list of children for nonterminals, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
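A minimal sketch of that (type, nodes) convention, with invented numbers (tokens stay below 256, nonterminals start at 256):

    NAME = 1              # a terminal (token) number
    simple_stmt = 257     # a nonterminal number

    name_node = (NAME, None)                # token nodes carry no children
    stmt_node = (simple_stmt, [name_node])  # nonterminal nodes carry a list of children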
@@ -184,11 +184,11 @@ class PgenParser(object):
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
# See if it's a nonterminal and if we're in its first set
itsdfa = _gram.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
# Push a nonterminal
_push(t, itsdfa, newstate)
break # To continue the outer while loop
else:
@@ -231,7 +231,7 @@ class PgenParser(object):
try:
# Equal to:
# dfa, state, node = self.stack[-1]
# symbol, children = node
# nonterminal, children = node
self.stack[-1][2][1].append(newnode)
except IndexError:
# Stack is empty, set the rootnode.

View File

@@ -29,7 +29,8 @@ class ParserGenerator(object):
self._nonterminal_to_dfas = rule_to_dfas
def make_grammar(self, grammar):
self._first_terminals = {} # map from symbol name to set of tokens
# Map from grammar rule (nonterminal) name to a set of tokens.
self._first_terminals = {}
names = list(self._nonterminal_to_dfas.keys())
names.sort()
@@ -37,9 +38,9 @@ class ParserGenerator(object):
if name not in self._first_terminals:
self._calculate_first_terminals(name)
i = 256 + len(grammar.symbol2number)
grammar.symbol2number[name] = i
grammar.number2symbol[i] = name
i = 256 + len(grammar.nonterminal2number)
grammar.nonterminal2number[name] = i
grammar.number2nonterminal[i] = name
# Now that we have calculated the first terminals, we are sure that
# there is no left recursion or ambiguities.
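The renamed bookkeeping follows the same simple scheme as before: each rule name, taken in sorted order, gets the next free number at or above 256. A standalone sketch of that loop (the rule names are just examples):

    nonterminal2number = {}
    number2nonterminal = {}
    for name in sorted(['file_input', 'simple_stmt', 'expr_stmt']):
        i = 256 + len(nonterminal2number)
        nonterminal2number[name] = i
        number2nonterminal[i] = name
    # nonterminal2number == {'expr_stmt': 256, 'file_input': 257, 'simple_stmt': 258}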
@@ -55,7 +56,7 @@ class ParserGenerator(object):
arcs.append((0, dfas.index(state)))
states.append(arcs)
grammar.states.append(states)
grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name))
grammar.dfas[grammar.nonterminal2number[name]] = (states, self._make_first(grammar, name))
return grammar
def _make_first(self, grammar, name):
@@ -71,15 +72,15 @@ class ParserGenerator(object):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(grammar.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in grammar.symbol2number:
# A symbol name (a non-terminal)
if label in grammar.symbol2label:
return grammar.symbol2label[label]
# Either a nonterminal name or a named token
if label in grammar.nonterminal2number:
# A nonterminal name (a non-terminal)
if label in grammar.nonterminal2label:
return grammar.nonterminal2label[label]
else:
grammar.labels.append((grammar.symbol2number[label], None))
grammar.symbol2label[label] = ilabel
grammar.label2symbol[ilabel] = label
grammar.labels.append((grammar.nonterminal2number[label], None))
grammar.nonterminal2label[label] = ilabel
grammar.label2nonterminal[ilabel] = label
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
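Put differently: every distinct label (a nonterminal reference, a named token, or a keyword) occupies one slot in grammar.labels, and DFA arcs refer to that slot by index; nonterminal2label and label2nonterminal simply cache the lookup in both directions. A hypothetical snapshot, assuming 'expr_stmt' was numbered 256 and NAME is token number 1:

    labels = [
        (0, 'EMPTY'),   # index 0 is reserved
        (1, None),      # index 1: the NAME token
        (256, None),    # index 2: a reference to the nonterminal 'expr_stmt'
        (1, 'if'),      # index 3: the keyword 'if' (a NAME with a fixed string)
    ]
    nonterminal2label = {'expr_stmt': 2}
    label2nonterminal = {2: 'expr_stmt'}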
@@ -293,7 +294,7 @@ def generate_grammar(bnf_grammar, token_namespace):
own parser.
"""
rule_to_dfas = {}
start_symbol = None
start_nonterminal = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(a, z)
dfas = _make_dfas(nfa_a, nfa_z)
@@ -304,8 +305,8 @@ def generate_grammar(bnf_grammar, token_namespace):
rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen)
if start_symbol is None:
start_symbol = nfa_a.from_rule
if start_nonterminal is None:
start_nonterminal = nfa_a.from_rule
p = ParserGenerator(rule_to_dfas, token_namespace)
return p.make_grammar(Grammar(bnf_grammar, start_symbol))
return p.make_grammar(Grammar(bnf_grammar, start_nonterminal))
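A usage sketch of this entry point (internal pgen2 API, so the import paths and the grammar file location are assumptions and may differ between parso versions):

    from parso.pgen2 import generate_grammar
    from parso.python import token

    with open('grammar36.txt') as f:      # hypothetical path to a CPython-style grammar file
        bnf_text = f.read()

    pgen_grammar = generate_grammar(bnf_text, token_namespace=token)
    print(sorted(pgen_grammar.nonterminal2number)[:5])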

View File

@@ -41,9 +41,9 @@ def _flows_finished(pgen_grammar, stack):
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for dfa, newstate, (symbol_number, nodes) in stack:
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
'for_stmt', 'try_stmt'):
for dfa, newstate, (nonterminal_number, nodes) in stack:
if pgen_grammar.number2nonterminal[nonterminal_number] \
in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False
return True
@@ -52,8 +52,8 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack):
return False
for dfa, newstate, (symbol_number, nodes) in reversed(stack):
if pgen_grammar.number2symbol[symbol_number] == 'suite':
for dfa, newstate, (nonterminal_number, nodes) in reversed(stack):
if pgen_grammar.number2nonterminal[nonterminal_number] == 'suite':
# If only newline is in the suite, the suite is not valid, yet.
return len(nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels

View File

@@ -62,8 +62,8 @@ class Parser(BaseParser):
FSTRING_END: tree.FStringEnd,
}
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
super(Parser, self).__init__(pgen_grammar, start_nonterminal, error_recovery=error_recovery)
self.syntax_errors = []
self._omit_dedent_list = []
@@ -81,19 +81,19 @@ class Parser(BaseParser):
def parse(self, tokens):
if self._error_recovery:
if self._start_symbol != 'file_input':
if self._start_nonterminal != 'file_input':
raise NotImplementedError
tokens = self._recovery_tokenize(tokens)
node = super(Parser, self).parse(tokens)
if self._start_symbol == 'file_input' != node.type:
if self._start_nonterminal == 'file_input' != node.type:
# If there's only one statement, we get back a non-module. That's
# not what we want, we want a module, so we add it here:
node = self.convert_node(
self._pgen_grammar,
self._pgen_grammar.symbol2number['file_input'],
self._pgen_grammar.nonterminal2number['file_input'],
[node]
)
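The observable effect of this wrapping is that, for the default start nonterminal, the public API always returns a module node even for one-statement input (a small sketch using parso's top-level helper):

    import parso

    tree = parso.parse("x = 1")
    print(tree.type)   # expected: 'file_input' (a module), not a bare statement node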
@@ -107,24 +107,24 @@ class Parser(BaseParser):
grammar rule produces a new complete node, so that the tree is built
strictly bottom-up.
"""
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type]
# TODO REMOVE nonterminal, we don't want type here.
nonterminal = pgen_grammar.number2nonterminal[type]
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
if symbol == 'suite':
if nonterminal == 'suite':
# We don't want the INDENT/DEDENT in our parser tree. Those
# leaves are just cancer. They are virtual leaves and not real
# ones and therefore have pseudo start/end positions and no
# prefixes. Just ignore them.
children = [children[0]] + children[2:-1]
elif symbol == 'list_if':
elif nonterminal == 'list_if':
# Make transitioning from 2 to 3 easier.
symbol = 'comp_if'
elif symbol == 'listmaker':
nonterminal = 'comp_if'
elif nonterminal == 'listmaker':
# Same as list_if above.
symbol = 'testlist_comp'
return self.default_node(symbol, children)
nonterminal = 'testlist_comp'
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
# print('leaf', repr(value), token.tok_name[type])
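As a quick check of the suite handling above, the INDENT/DEDENT leaves really do disappear from the tree; the exact child types are version-dependent, so this is only a sketch:

    import parso

    if_stmt = parso.parse("if x:\n    pass\n").children[0]
    suite = if_stmt.children[-1]
    print(suite.type)                        # 'suite'
    print([c.type for c in suite.children])  # e.g. ['newline', 'simple_stmt'] - no indent/dedent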
@@ -138,10 +138,10 @@ class Parser(BaseParser):
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback):
def get_symbol_and_nodes(stack):
def get_nonterminal_and_nodes(stack):
for dfa, state, (type_, nodes) in stack:
symbol = pgen_grammar.number2symbol[type_]
yield symbol, nodes
nonterminal = pgen_grammar.number2nonterminal[type_]
yield nonterminal, nodes
tos_nodes = stack.get_tos_nodes()
if tos_nodes:
@@ -149,7 +149,7 @@ class Parser(BaseParser):
else:
last_leaf = None
if self._start_symbol == 'file_input' and \
if self._start_nonterminal == 'file_input' and \
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
def reduce_stack(states, newstate):
# reduce
@@ -168,13 +168,13 @@ class Parser(BaseParser):
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
dfa, state, (type_, nodes) = stack[-1]
symbol = pgen_grammar.number2symbol[type_]
nonterminal = pgen_grammar.number2nonterminal[type_]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
if ilabel == i:
if symbol == 'simple_stmt':
if nonterminal == 'simple_stmt':
# This is basically shifting
stack[-1] = (dfa, newstate, (type_, nodes))
@@ -182,12 +182,12 @@ class Parser(BaseParser):
add_token_callback(typ, value, start_pos, prefix)
return
# Check if we're at the right point
#for symbol, nodes in get_symbol_and_nodes(stack):
#for nonterminal, nodes in get_nonterminal_and_nodes(stack):
# self.pgen_parser._pop()
#break
break
#symbol = pgen_grammar.number2symbol[type_]
#nonterminal = pgen_grammar.number2nonterminal[type_]
if not self._error_recovery:
return super(Parser, self).error_recovery(
@@ -198,21 +198,21 @@ class Parser(BaseParser):
# For now just discard everything that is not a suite or
# file_input, if we detect an error.
one_line_suite = False
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
for index, (nonterminal, nodes) in reversed(list(enumerate(get_nonterminal_and_nodes(stack)))):
# `suite` can sometimes be only simple_stmt, not stmt.
if one_line_suite:
break
elif symbol == 'file_input':
elif nonterminal == 'file_input':
break
elif symbol == 'suite':
elif nonterminal == 'suite':
if len(nodes) > 1:
break
elif not nodes:
one_line_suite = True
# `suite` without an indent are error nodes.
return index, symbol, nodes
return index, nonterminal, nodes
index, symbol, nodes = current_suite(stack)
index, nonterminal, nodes = current_suite(stack)
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
@@ -226,11 +226,11 @@ class Parser(BaseParser):
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
stack[-1][2][1].append(error_leaf)
if symbol == 'suite':
if nonterminal == 'suite':
dfa, state, node = stack[-1]
states, first = dfa
arcs = states[state]
intended_label = pgen_grammar.symbol2label['stmt']
intended_label = pgen_grammar.nonterminal2label['stmt']
# Introduce a proper state transition. We're basically allowing
# there to be no valid statements inside a suite.
if [x[0] for x in arcs] == [intended_label]:
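For reference, the user-visible outcome of this recovery path: syntactically broken input still yields a tree, with the offending pieces collected into error nodes and error leaves instead of raising ParserSyntaxError (a small sketch via the public API; the exact node layout is illustrative):

    import parso

    module = parso.parse("def f(:\n    pass\n")
    print([c.type for c in module.children])
    # expected to contain 'error_node' / 'error_leaf' entries alongside 'endmarker'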