From 73ce57428be069d2c34eb1aae0325e0dc7cbd997 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Sun, 17 Jun 2018 18:30:20 +0200 Subject: [PATCH] Try to completely remove the word symbol and use nonterminal The ones that we could not remove are in grammar.py, because that's the public documented API. --- parso/grammar.py | 12 ++++----- parso/parser.py | 14 +++++----- parso/pgen2/grammar.py | 36 +++++++++++++------------- parso/pgen2/parse.py | 10 ++++---- parso/pgen2/pgen.py | 35 ++++++++++++------------- parso/python/diff.py | 10 ++++---- parso/python/parser.py | 58 +++++++++++++++++++++--------------------- 7 files changed, 89 insertions(+), 86 deletions(-) diff --git a/parso/grammar.py b/parso/grammar.py index 6c13f00..2906b5d 100644 --- a/parso/grammar.py +++ b/parso/grammar.py @@ -51,8 +51,8 @@ class Grammar(object): it is invalid, it will be returned as an error node. If disabled, you will get a ParseError when encountering syntax errors in your code. - :param str start_symbol: The grammar symbol that you want to parse. Only - allowed to be used when error_recovery is False. + :param str start_symbol: The grammar rule (nonterminal) that you want + to parse. Only allowed to be used when error_recovery is False. :param str path: The path to the file you want to open. Only needed for caching. :param bool cache: Keeps a copy of the parser tree in RAM and on disk if a path is given. Returns the cached trees if the corresponding @@ -88,7 +88,7 @@ class Grammar(object): raise TypeError("Please provide either code or a path.") if start_symbol is None: - start_symbol = self._start_symbol + start_symbol = self._start_nonterminal if error_recovery and start_symbol != 'file_input': raise NotImplementedError("This is currently not implemented.") @@ -136,7 +136,7 @@ class Grammar(object): p = self._parser( self._pgen_grammar, error_recovery=error_recovery, - start_symbol=start_symbol + start_nonterminal=start_symbol ) root_node = p.parse(tokens=tokens) @@ -186,7 +186,7 @@ class Grammar(object): return normalizer.issues def __repr__(self): - labels = self._pgen_grammar.number2symbol.values() + labels = self._pgen_grammar.number2nonterminal.values() txt = ' '.join(list(labels)[:3]) + ' ...' return '<%s:%s>' % (self.__class__.__name__, txt) @@ -194,7 +194,7 @@ class Grammar(object): class PythonGrammar(Grammar): _error_normalizer_config = ErrorFinderConfig() _token_namespace = token - _start_symbol = 'file_input' + _start_nonterminal = 'file_input' def __init__(self, version_info, bnf_text): super(PythonGrammar, self).__init__( diff --git a/parso/parser.py b/parso/parser.py index 555ebc7..c9df89e 100644 --- a/parso/parser.py +++ b/parso/parser.py @@ -38,13 +38,13 @@ class BaseParser(object): } default_leaf = tree.Leaf - def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False): + def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False): self._pgen_grammar = pgen_grammar - self._start_symbol = start_symbol + self._start_nonterminal = start_nonterminal self._error_recovery = error_recovery def parse(self, tokens): - start_number = self._pgen_grammar.symbol2number[self._start_symbol] + start_number = self._pgen_grammar.nonterminal2number[self._start_nonterminal] self.pgen_parser = PgenParser( self._pgen_grammar, self.convert_node, self.convert_leaf, self.error_recovery, start_number @@ -64,12 +64,12 @@ class BaseParser(object): raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf) def convert_node(self, pgen_grammar, type_, children): - # TODO REMOVE symbol, we don't want type here. - symbol = pgen_grammar.number2symbol[type_] + # TODO REMOVE nonterminal, we don't want type here. + nonterminal = pgen_grammar.number2nonterminal[type_] try: - return self.node_map[symbol](children) + return self.node_map[nonterminal](children) except KeyError: - return self.default_node(symbol, children) + return self.default_node(nonterminal, children) def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos): try: diff --git a/parso/pgen2/grammar.py b/parso/pgen2/grammar.py index 1a2c6e9..00a6e8c 100644 --- a/parso/pgen2/grammar.py +++ b/parso/pgen2/grammar.py @@ -28,12 +28,14 @@ class Grammar(object): The instance variables are as follows: - symbol2number -- a dict mapping symbol names to numbers. Symbol - numbers are always 256 or higher, to distinguish - them from token numbers, which are between 0 and - 255 (inclusive). + nonterminal2number -- + A dict mapping nonterminal names to numbers. + Nonterminal numbers are always 256 or higher, to + distinguish them from token numbers, which are between 0 + and 255 (inclusive). - number2symbol -- a dict mapping numbers to symbol names; + number2nonterminal -- + A dict mapping numbers to nonterminal names; these two are each other's inverse. states -- a list of DFAs, where each DFA is a list of @@ -44,20 +46,20 @@ class Grammar(object): Final states are represented by a special arc of the form (0, j) where j is its own state number. - dfas -- a dict mapping symbol numbers to (DFA, first) + dfas -- a dict mapping nonterminal numbers to (DFA, first) pairs, where DFA is an item from the states list above, and first is a set of tokens that can begin this grammar rule (represented by a dict whose values are always 1). labels -- a list of (x, y) pairs where x is either a token - number or a symbol number, and y is either None + number or a nonterminal number, and y is either None or a string; the strings are keywords. The label number is the index in this list; label numbers are used to mark state transitions (arcs) in the DFAs. - start -- the number of the grammar's start symbol. + start -- the number of the grammar's start nonterminal. keywords -- a dict mapping keyword strings to arc labels. @@ -65,29 +67,29 @@ class Grammar(object): """ - def __init__(self, bnf_text, start_symbol): - self.symbol2number = {} - self.number2symbol = {} + def __init__(self, bnf_text, start_nonterminal): + self.nonterminal2number = {} + self.number2nonterminal = {} self.states = [] self.dfas = {} self.labels = [(0, "EMPTY")] self.keywords = {} self.tokens = {} - self.symbol2label = {} - self.label2symbol = {} - self.start_symbol = start_symbol + self.nonterminal2label = {} + self.label2nonterminal = {} + self.start_nonterminal = start_nonterminal @property def start(self): - return self.symbol2number[self.start_symbol] + return self.nonterminal2number[self.start_nonterminal] def report(self): """Dump the grammar tables to standard output, for debugging.""" from pprint import pprint print("s2n") - pprint(self.symbol2number) + pprint(self.nonterminal2number) print("n2s") - pprint(self.number2symbol) + pprint(self.number2nonterminal) print("states") pprint(self.states) print("dfas") diff --git a/parso/pgen2/parse.py b/parso/pgen2/parse.py index e2d9593..4e1ad6c 100644 --- a/parso/pgen2/parse.py +++ b/parso/pgen2/parse.py @@ -118,8 +118,8 @@ class PgenParser(object): up. A concrete syntax tree node is a (type, nodes) tuple, where - type is the node type (a token or symbol number) and nodes - is a list of children for symbols, and None for tokens. + type is the node type (a token or nonterminal number) and nodes + is a list of children for nonterminals, and None for tokens. An abstract syntax tree node may be anything; this is entirely up to the converter function. @@ -184,11 +184,11 @@ class PgenParser(object): # Done with this token return False elif t >= 256: - # See if it's a symbol and if we're in its first set + # See if it's a nonterminal and if we're in its first set itsdfa = _gram.dfas[t] itsstates, itsfirst = itsdfa if ilabel in itsfirst: - # Push a symbol + # Push a nonterminal _push(t, itsdfa, newstate) break # To continue the outer while loop else: @@ -231,7 +231,7 @@ class PgenParser(object): try: # Equal to: # dfa, state, node = self.stack[-1] - # symbol, children = node + # nonterminal, children = node self.stack[-1][2][1].append(newnode) except IndexError: # Stack is empty, set the rootnode. diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py index 9d0988f..de1efcb 100644 --- a/parso/pgen2/pgen.py +++ b/parso/pgen2/pgen.py @@ -29,7 +29,8 @@ class ParserGenerator(object): self._nonterminal_to_dfas = rule_to_dfas def make_grammar(self, grammar): - self._first_terminals = {} # map from symbol name to set of tokens + # Map from grammar rule (nonterminal) name to a set of tokens. + self._first_terminals = {} names = list(self._nonterminal_to_dfas.keys()) names.sort() @@ -37,9 +38,9 @@ class ParserGenerator(object): if name not in self._first_terminals: self._calculate_first_terminals(name) - i = 256 + len(grammar.symbol2number) - grammar.symbol2number[name] = i - grammar.number2symbol[i] = name + i = 256 + len(grammar.nonterminal2number) + grammar.nonterminal2number[name] = i + grammar.number2nonterminal[i] = name # Now that we have calculated the first terminals, we are sure that # there is no left recursion or ambiguities. @@ -55,7 +56,7 @@ class ParserGenerator(object): arcs.append((0, dfas.index(state))) states.append(arcs) grammar.states.append(states) - grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name)) + grammar.dfas[grammar.nonterminal2number[name]] = (states, self._make_first(grammar, name)) return grammar def _make_first(self, grammar, name): @@ -71,15 +72,15 @@ class ParserGenerator(object): # XXX Maybe this should be a method on a subclass of converter? ilabel = len(grammar.labels) if label[0].isalpha(): - # Either a symbol name or a named token - if label in grammar.symbol2number: - # A symbol name (a non-terminal) - if label in grammar.symbol2label: - return grammar.symbol2label[label] + # Either a nonterminal name or a named token + if label in grammar.nonterminal2number: + # A nonterminal name (a non-terminal) + if label in grammar.nonterminal2label: + return grammar.nonterminal2label[label] else: - grammar.labels.append((grammar.symbol2number[label], None)) - grammar.symbol2label[label] = ilabel - grammar.label2symbol[ilabel] = label + grammar.labels.append((grammar.nonterminal2number[label], None)) + grammar.nonterminal2label[label] = ilabel + grammar.label2nonterminal[ilabel] = label return ilabel else: # A named token (NAME, NUMBER, STRING) @@ -293,7 +294,7 @@ def generate_grammar(bnf_grammar, token_namespace): own parser. """ rule_to_dfas = {} - start_symbol = None + start_nonterminal = None for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse(): #_dump_nfa(a, z) dfas = _make_dfas(nfa_a, nfa_z) @@ -304,8 +305,8 @@ def generate_grammar(bnf_grammar, token_namespace): rule_to_dfas[nfa_a.from_rule] = dfas #print(nfa_a.from_rule, oldlen, newlen) - if start_symbol is None: - start_symbol = nfa_a.from_rule + if start_nonterminal is None: + start_nonterminal = nfa_a.from_rule p = ParserGenerator(rule_to_dfas, token_namespace) - return p.make_grammar(Grammar(bnf_grammar, start_symbol)) + return p.make_grammar(Grammar(bnf_grammar, start_nonterminal)) diff --git a/parso/python/diff.py b/parso/python/diff.py index f8b73c7..529f06a 100644 --- a/parso/python/diff.py +++ b/parso/python/diff.py @@ -41,9 +41,9 @@ def _flows_finished(pgen_grammar, stack): if, while, for and try might not be finished, because another part might still be parsed. """ - for dfa, newstate, (symbol_number, nodes) in stack: - if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt', - 'for_stmt', 'try_stmt'): + for dfa, newstate, (nonterminal_number, nodes) in stack: + if pgen_grammar.number2nonterminal[nonterminal_number] \ + in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'): return False return True @@ -52,8 +52,8 @@ def suite_or_file_input_is_valid(pgen_grammar, stack): if not _flows_finished(pgen_grammar, stack): return False - for dfa, newstate, (symbol_number, nodes) in reversed(stack): - if pgen_grammar.number2symbol[symbol_number] == 'suite': + for dfa, newstate, (nonterminal_number, nodes) in reversed(stack): + if pgen_grammar.number2nonterminal[nonterminal_number] == 'suite': # If only newline is in the suite, the suite is not valid, yet. return len(nodes) > 1 # Not reaching a suite means that we're dealing with file_input levels diff --git a/parso/python/parser.py b/parso/python/parser.py index b99053b..cb283e8 100644 --- a/parso/python/parser.py +++ b/parso/python/parser.py @@ -62,8 +62,8 @@ class Parser(BaseParser): FSTRING_END: tree.FStringEnd, } - def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'): - super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery) + def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'): + super(Parser, self).__init__(pgen_grammar, start_nonterminal, error_recovery=error_recovery) self.syntax_errors = [] self._omit_dedent_list = [] @@ -81,19 +81,19 @@ class Parser(BaseParser): def parse(self, tokens): if self._error_recovery: - if self._start_symbol != 'file_input': + if self._start_nonterminal != 'file_input': raise NotImplementedError tokens = self._recovery_tokenize(tokens) node = super(Parser, self).parse(tokens) - if self._start_symbol == 'file_input' != node.type: + if self._start_nonterminal == 'file_input' != node.type: # If there's only one statement, we get back a non-module. That's # not what we want, we want a module, so we add it here: node = self.convert_node( self._pgen_grammar, - self._pgen_grammar.symbol2number['file_input'], + self._pgen_grammar.nonterminal2number['file_input'], [node] ) @@ -107,24 +107,24 @@ class Parser(BaseParser): grammar rule produces a new complete node, so that the tree is build strictly bottom-up. """ - # TODO REMOVE symbol, we don't want type here. - symbol = pgen_grammar.number2symbol[type] + # TODO REMOVE nonterminal, we don't want type here. + nonterminal = pgen_grammar.number2nonterminal[type] try: - return self.node_map[symbol](children) + return self.node_map[nonterminal](children) except KeyError: - if symbol == 'suite': + if nonterminal == 'suite': # We don't want the INDENT/DEDENT in our parser tree. Those # leaves are just cancer. They are virtual leaves and not real # ones and therefore have pseudo start/end positions and no # prefixes. Just ignore them. children = [children[0]] + children[2:-1] - elif symbol == 'list_if': + elif nonterminal == 'list_if': # Make transitioning from 2 to 3 easier. - symbol = 'comp_if' - elif symbol == 'listmaker': + nonterminal = 'comp_if' + elif nonterminal == 'listmaker': # Same as list_if above. - symbol = 'testlist_comp' - return self.default_node(symbol, children) + nonterminal = 'testlist_comp' + return self.default_node(nonterminal, children) def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): # print('leaf', repr(value), token.tok_name[type]) @@ -138,10 +138,10 @@ class Parser(BaseParser): def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, add_token_callback): - def get_symbol_and_nodes(stack): + def get_nonterminal_and_nodes(stack): for dfa, state, (type_, nodes) in stack: - symbol = pgen_grammar.number2symbol[type_] - yield symbol, nodes + nonterminal = pgen_grammar.number2nonterminal[type_] + yield nonterminal, nodes tos_nodes = stack.get_tos_nodes() if tos_nodes: @@ -149,7 +149,7 @@ class Parser(BaseParser): else: last_leaf = None - if self._start_symbol == 'file_input' and \ + if self._start_nonterminal == 'file_input' and \ (typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value): def reduce_stack(states, newstate): # reduce @@ -168,13 +168,13 @@ class Parser(BaseParser): ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value) dfa, state, (type_, nodes) = stack[-1] - symbol = pgen_grammar.number2symbol[type_] + nonterminal = pgen_grammar.number2nonterminal[type_] states, first = dfa arcs = states[state] # Look for a state with this label for i, newstate in arcs: if ilabel == i: - if symbol == 'simple_stmt': + if nonterminal == 'simple_stmt': # This is basically shifting stack[-1] = (dfa, newstate, (type_, nodes)) @@ -182,12 +182,12 @@ class Parser(BaseParser): add_token_callback(typ, value, start_pos, prefix) return # Check if we're at the right point - #for symbol, nodes in get_symbol_and_nodes(stack): + #for nonterminal, nodes in get_nonterminal_and_nodes(stack): # self.pgen_parser._pop() #break break - #symbol = pgen_grammar.number2symbol[type_] + #nonterminal = pgen_grammar.number2nonterminal[type_] if not self._error_recovery: return super(Parser, self).error_recovery( @@ -198,21 +198,21 @@ class Parser(BaseParser): # For now just discard everything that is not a suite or # file_input, if we detect an error. one_line_suite = False - for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))): + for index, (nonterminal, nodes) in reversed(list(enumerate(get_nonterminal_and_nodes(stack)))): # `suite` can sometimes be only simple_stmt, not stmt. if one_line_suite: break - elif symbol == 'file_input': + elif nonterminal == 'file_input': break - elif symbol == 'suite': + elif nonterminal == 'suite': if len(nodes) > 1: break elif not nodes: one_line_suite = True # `suite` without an indent are error nodes. - return index, symbol, nodes + return index, nonterminal, nodes - index, symbol, nodes = current_suite(stack) + index, nonterminal, nodes = current_suite(stack) # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index) if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos): @@ -226,11 +226,11 @@ class Parser(BaseParser): error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix) stack[-1][2][1].append(error_leaf) - if symbol == 'suite': + if nonterminal == 'suite': dfa, state, node = stack[-1] states, first = dfa arcs = states[state] - intended_label = pgen_grammar.symbol2label['stmt'] + intended_label = pgen_grammar.nonterminal2label['stmt'] # Introduce a proper state transition. We're basically allowing # there to be no valid statements inside a suite. if [x[0] for x in arcs] == [intended_label]: