Try to completely remove the word "symbol" and use "nonterminal"

The ones that we could not remove are in grammar.py, because that's the public documented API.
This commit is contained in:
Dave Halter
2018-06-17 18:30:20 +02:00
parent 640f544af9
commit 73ce57428b
7 changed files with 89 additions and 86 deletions

View File

@@ -51,8 +51,8 @@ class Grammar(object):
it is invalid, it will be returned as an error node. If disabled, it is invalid, it will be returned as an error node. If disabled,
you will get a ParseError when encountering syntax errors in your you will get a ParseError when encountering syntax errors in your
code. code.
:param str start_symbol: The grammar symbol that you want to parse. Only :param str start_symbol: The grammar rule (nonterminal) that you want
allowed to be used when error_recovery is False. to parse. Only allowed to be used when error_recovery is False.
:param str path: The path to the file you want to open. Only needed for caching. :param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk :param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding if a path is given. Returns the cached trees if the corresponding
@@ -88,7 +88,7 @@ class Grammar(object):
raise TypeError("Please provide either code or a path.") raise TypeError("Please provide either code or a path.")
if start_symbol is None: if start_symbol is None:
start_symbol = self._start_symbol start_symbol = self._start_nonterminal
if error_recovery and start_symbol != 'file_input': if error_recovery and start_symbol != 'file_input':
raise NotImplementedError("This is currently not implemented.") raise NotImplementedError("This is currently not implemented.")
@@ -136,7 +136,7 @@ class Grammar(object):
p = self._parser( p = self._parser(
self._pgen_grammar, self._pgen_grammar,
error_recovery=error_recovery, error_recovery=error_recovery,
start_symbol=start_symbol start_nonterminal=start_symbol
) )
root_node = p.parse(tokens=tokens) root_node = p.parse(tokens=tokens)
@@ -186,7 +186,7 @@ class Grammar(object):
return normalizer.issues return normalizer.issues
def __repr__(self): def __repr__(self):
labels = self._pgen_grammar.number2symbol.values() labels = self._pgen_grammar.number2nonterminal.values()
txt = ' '.join(list(labels)[:3]) + ' ...' txt = ' '.join(list(labels)[:3]) + ' ...'
return '<%s:%s>' % (self.__class__.__name__, txt) return '<%s:%s>' % (self.__class__.__name__, txt)
@@ -194,7 +194,7 @@ class Grammar(object):
class PythonGrammar(Grammar): class PythonGrammar(Grammar):
_error_normalizer_config = ErrorFinderConfig() _error_normalizer_config = ErrorFinderConfig()
_token_namespace = token _token_namespace = token
_start_symbol = 'file_input' _start_nonterminal = 'file_input'
def __init__(self, version_info, bnf_text): def __init__(self, version_info, bnf_text):
super(PythonGrammar, self).__init__( super(PythonGrammar, self).__init__(

View File

@@ -38,13 +38,13 @@ class BaseParser(object):
} }
default_leaf = tree.Leaf default_leaf = tree.Leaf
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False): def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
self._pgen_grammar = pgen_grammar self._pgen_grammar = pgen_grammar
self._start_symbol = start_symbol self._start_nonterminal = start_nonterminal
self._error_recovery = error_recovery self._error_recovery = error_recovery
def parse(self, tokens): def parse(self, tokens):
start_number = self._pgen_grammar.symbol2number[self._start_symbol] start_number = self._pgen_grammar.nonterminal2number[self._start_nonterminal]
self.pgen_parser = PgenParser( self.pgen_parser = PgenParser(
self._pgen_grammar, self.convert_node, self.convert_leaf, self._pgen_grammar, self.convert_node, self.convert_leaf,
self.error_recovery, start_number self.error_recovery, start_number
@@ -64,12 +64,12 @@ class BaseParser(object):
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf) raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
def convert_node(self, pgen_grammar, type_, children): def convert_node(self, pgen_grammar, type_, children):
# TODO REMOVE symbol, we don't want type here. # TODO REMOVE nonterminal, we don't want type here.
symbol = pgen_grammar.number2symbol[type_] nonterminal = pgen_grammar.number2nonterminal[type_]
try: try:
return self.node_map[symbol](children) return self.node_map[nonterminal](children)
except KeyError: except KeyError:
return self.default_node(symbol, children) return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos): def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
try: try:

View File

@@ -28,12 +28,14 @@ class Grammar(object):
The instance variables are as follows: The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol nonterminal2number --
numbers are always 256 or higher, to distinguish A dict mapping nonterminal names to numbers.
them from token numbers, which are between 0 and Nonterminal numbers are always 256 or higher, to
255 (inclusive). distinguish them from token numbers, which are between 0
and 255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names; number2nonterminal --
A dict mapping numbers to nonterminal names;
these two are each other's inverse. these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of states -- a list of DFAs, where each DFA is a list of
@@ -44,20 +46,20 @@ class Grammar(object):
Final states are represented by a special arc of Final states are represented by a special arc of
the form (0, j) where j is its own state number. the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first) dfas -- a dict mapping nonterminal numbers to (DFA, first)
pairs, where DFA is an item from the states list pairs, where DFA is an item from the states list
above, and first is a set of tokens that can above, and first is a set of tokens that can
begin this grammar rule (represented by a dict begin this grammar rule (represented by a dict
whose values are always 1). whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None number or a nonterminal number, and y is either None
or a string; the strings are keywords. The label or a string; the strings are keywords. The label
number is the index in this list; label numbers number is the index in this list; label numbers
are used to mark state transitions (arcs) in the are used to mark state transitions (arcs) in the
DFAs. DFAs.
start -- the number of the grammar's start symbol. start -- the number of the grammar's start nonterminal.
keywords -- a dict mapping keyword strings to arc labels. keywords -- a dict mapping keyword strings to arc labels.
@@ -65,29 +67,29 @@ class Grammar(object):
""" """
def __init__(self, bnf_text, start_symbol): def __init__(self, bnf_text, start_nonterminal):
self.symbol2number = {} self.nonterminal2number = {}
self.number2symbol = {} self.number2nonterminal = {}
self.states = [] self.states = []
self.dfas = {} self.dfas = {}
self.labels = [(0, "EMPTY")] self.labels = [(0, "EMPTY")]
self.keywords = {} self.keywords = {}
self.tokens = {} self.tokens = {}
self.symbol2label = {} self.nonterminal2label = {}
self.label2symbol = {} self.label2nonterminal = {}
self.start_symbol = start_symbol self.start_nonterminal = start_nonterminal
@property @property
def start(self): def start(self):
return self.symbol2number[self.start_symbol] return self.nonterminal2number[self.start_nonterminal]
def report(self): def report(self):
"""Dump the grammar tables to standard output, for debugging.""" """Dump the grammar tables to standard output, for debugging."""
from pprint import pprint from pprint import pprint
print("s2n") print("s2n")
pprint(self.symbol2number) pprint(self.nonterminal2number)
print("n2s") print("n2s")
pprint(self.number2symbol) pprint(self.number2nonterminal)
print("states") print("states")
pprint(self.states) pprint(self.states)
print("dfas") print("dfas")

View File

@@ -118,8 +118,8 @@ class PgenParser(object):
up. up.
A concrete syntax tree node is a (type, nodes) tuple, where A concrete syntax tree node is a (type, nodes) tuple, where
type is the node type (a token or symbol number) and nodes type is the node type (a token or nonterminal number) and nodes
is a list of children for symbols, and None for tokens. is a list of children for nonterminals, and None for tokens.
An abstract syntax tree node may be anything; this is entirely An abstract syntax tree node may be anything; this is entirely
up to the converter function. up to the converter function.
@@ -184,11 +184,11 @@ class PgenParser(object):
# Done with this token # Done with this token
return False return False
elif t >= 256: elif t >= 256:
# See if it's a symbol and if we're in its first set # See if it's a nonterminal and if we're in its first set
itsdfa = _gram.dfas[t] itsdfa = _gram.dfas[t]
itsstates, itsfirst = itsdfa itsstates, itsfirst = itsdfa
if ilabel in itsfirst: if ilabel in itsfirst:
# Push a symbol # Push a nonterminal
_push(t, itsdfa, newstate) _push(t, itsdfa, newstate)
break # To continue the outer while loop break # To continue the outer while loop
else: else:
@@ -231,7 +231,7 @@ class PgenParser(object):
try: try:
# Equal to: # Equal to:
# dfa, state, node = self.stack[-1] # dfa, state, node = self.stack[-1]
# symbol, children = node # nonterminal, children = node
self.stack[-1][2][1].append(newnode) self.stack[-1][2][1].append(newnode)
except IndexError: except IndexError:
# Stack is empty, set the rootnode. # Stack is empty, set the rootnode.

View File

@@ -29,7 +29,8 @@ class ParserGenerator(object):
self._nonterminal_to_dfas = rule_to_dfas self._nonterminal_to_dfas = rule_to_dfas
def make_grammar(self, grammar): def make_grammar(self, grammar):
self._first_terminals = {} # map from symbol name to set of tokens # Map from grammar rule (nonterminal) name to a set of tokens.
self._first_terminals = {}
names = list(self._nonterminal_to_dfas.keys()) names = list(self._nonterminal_to_dfas.keys())
names.sort() names.sort()
@@ -37,9 +38,9 @@ class ParserGenerator(object):
if name not in self._first_terminals: if name not in self._first_terminals:
self._calculate_first_terminals(name) self._calculate_first_terminals(name)
i = 256 + len(grammar.symbol2number) i = 256 + len(grammar.nonterminal2number)
grammar.symbol2number[name] = i grammar.nonterminal2number[name] = i
grammar.number2symbol[i] = name grammar.number2nonterminal[i] = name
# Now that we have calculated the first terminals, we are sure that # Now that we have calculated the first terminals, we are sure that
# there is no left recursion or ambiguities. # there is no left recursion or ambiguities.
@@ -55,7 +56,7 @@ class ParserGenerator(object):
arcs.append((0, dfas.index(state))) arcs.append((0, dfas.index(state)))
states.append(arcs) states.append(arcs)
grammar.states.append(states) grammar.states.append(states)
grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name)) grammar.dfas[grammar.nonterminal2number[name]] = (states, self._make_first(grammar, name))
return grammar return grammar
def _make_first(self, grammar, name): def _make_first(self, grammar, name):
@@ -71,15 +72,15 @@ class ParserGenerator(object):
# XXX Maybe this should be a method on a subclass of converter? # XXX Maybe this should be a method on a subclass of converter?
ilabel = len(grammar.labels) ilabel = len(grammar.labels)
if label[0].isalpha(): if label[0].isalpha():
# Either a symbol name or a named token # Either a nonterminal name or a named token
if label in grammar.symbol2number: if label in grammar.nonterminal2number:
# A symbol name (a non-terminal) # A nonterminal name (a non-terminal)
if label in grammar.symbol2label: if label in grammar.nonterminal2label:
return grammar.symbol2label[label] return grammar.nonterminal2label[label]
else: else:
grammar.labels.append((grammar.symbol2number[label], None)) grammar.labels.append((grammar.nonterminal2number[label], None))
grammar.symbol2label[label] = ilabel grammar.nonterminal2label[label] = ilabel
grammar.label2symbol[ilabel] = label grammar.label2nonterminal[ilabel] = label
return ilabel return ilabel
else: else:
# A named token (NAME, NUMBER, STRING) # A named token (NAME, NUMBER, STRING)
@@ -293,7 +294,7 @@ def generate_grammar(bnf_grammar, token_namespace):
own parser. own parser.
""" """
rule_to_dfas = {} rule_to_dfas = {}
start_symbol = None start_nonterminal = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse(): for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(a, z) #_dump_nfa(a, z)
dfas = _make_dfas(nfa_a, nfa_z) dfas = _make_dfas(nfa_a, nfa_z)
@@ -304,8 +305,8 @@ def generate_grammar(bnf_grammar, token_namespace):
rule_to_dfas[nfa_a.from_rule] = dfas rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen) #print(nfa_a.from_rule, oldlen, newlen)
if start_symbol is None: if start_nonterminal is None:
start_symbol = nfa_a.from_rule start_nonterminal = nfa_a.from_rule
p = ParserGenerator(rule_to_dfas, token_namespace) p = ParserGenerator(rule_to_dfas, token_namespace)
return p.make_grammar(Grammar(bnf_grammar, start_symbol)) return p.make_grammar(Grammar(bnf_grammar, start_nonterminal))

View File

@@ -41,9 +41,9 @@ def _flows_finished(pgen_grammar, stack):
if, while, for and try might not be finished, because another part might if, while, for and try might not be finished, because another part might
still be parsed. still be parsed.
""" """
for dfa, newstate, (symbol_number, nodes) in stack: for dfa, newstate, (nonterminal_number, nodes) in stack:
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt', if pgen_grammar.number2nonterminal[nonterminal_number] \
'for_stmt', 'try_stmt'): in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False return False
return True return True
@@ -52,8 +52,8 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack): if not _flows_finished(pgen_grammar, stack):
return False return False
for dfa, newstate, (symbol_number, nodes) in reversed(stack): for dfa, newstate, (nonterminal_number, nodes) in reversed(stack):
if pgen_grammar.number2symbol[symbol_number] == 'suite': if pgen_grammar.number2nonterminal[nonterminal_number] == 'suite':
# If only newline is in the suite, the suite is not valid, yet. # If only newline is in the suite, the suite is not valid, yet.
return len(nodes) > 1 return len(nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels # Not reaching a suite means that we're dealing with file_input levels

View File

@@ -62,8 +62,8 @@ class Parser(BaseParser):
FSTRING_END: tree.FStringEnd, FSTRING_END: tree.FStringEnd,
} }
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'): def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery) super(Parser, self).__init__(pgen_grammar, start_nonterminal, error_recovery=error_recovery)
self.syntax_errors = [] self.syntax_errors = []
self._omit_dedent_list = [] self._omit_dedent_list = []
@@ -81,19 +81,19 @@ class Parser(BaseParser):
def parse(self, tokens): def parse(self, tokens):
if self._error_recovery: if self._error_recovery:
if self._start_symbol != 'file_input': if self._start_nonterminal != 'file_input':
raise NotImplementedError raise NotImplementedError
tokens = self._recovery_tokenize(tokens) tokens = self._recovery_tokenize(tokens)
node = super(Parser, self).parse(tokens) node = super(Parser, self).parse(tokens)
if self._start_symbol == 'file_input' != node.type: if self._start_nonterminal == 'file_input' != node.type:
# If there's only one statement, we get back a non-module. That's # If there's only one statement, we get back a non-module. That's
# not what we want, we want a module, so we add it here: # not what we want, we want a module, so we add it here:
node = self.convert_node( node = self.convert_node(
self._pgen_grammar, self._pgen_grammar,
self._pgen_grammar.symbol2number['file_input'], self._pgen_grammar.nonterminal2number['file_input'],
[node] [node]
) )
@@ -107,24 +107,24 @@ class Parser(BaseParser):
grammar rule produces a new complete node, so that the tree is built grammar rule produces a new complete node, so that the tree is built
strictly bottom-up. strictly bottom-up.
""" """
# TODO REMOVE symbol, we don't want type here. # TODO REMOVE nonterminal, we don't want type here.
symbol = pgen_grammar.number2symbol[type] nonterminal = pgen_grammar.number2nonterminal[type]
try: try:
return self.node_map[symbol](children) return self.node_map[nonterminal](children)
except KeyError: except KeyError:
if symbol == 'suite': if nonterminal == 'suite':
# We don't want the INDENT/DEDENT in our parser tree. Those # We don't want the INDENT/DEDENT in our parser tree. Those
# leaves are just cancer. They are virtual leaves and not real # leaves are just cancer. They are virtual leaves and not real
# ones and therefore have pseudo start/end positions and no # ones and therefore have pseudo start/end positions and no
# prefixes. Just ignore them. # prefixes. Just ignore them.
children = [children[0]] + children[2:-1] children = [children[0]] + children[2:-1]
elif symbol == 'list_if': elif nonterminal == 'list_if':
# Make transitioning from 2 to 3 easier. # Make transitioning from 2 to 3 easier.
symbol = 'comp_if' nonterminal = 'comp_if'
elif symbol == 'listmaker': elif nonterminal == 'listmaker':
# Same as list_if above. # Same as list_if above.
symbol = 'testlist_comp' nonterminal = 'testlist_comp'
return self.default_node(symbol, children) return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
# print('leaf', repr(value), token.tok_name[type]) # print('leaf', repr(value), token.tok_name[type])
@@ -138,10 +138,10 @@ class Parser(BaseParser):
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback): add_token_callback):
def get_symbol_and_nodes(stack): def get_nonterminal_and_nodes(stack):
for dfa, state, (type_, nodes) in stack: for dfa, state, (type_, nodes) in stack:
symbol = pgen_grammar.number2symbol[type_] nonterminal = pgen_grammar.number2nonterminal[type_]
yield symbol, nodes yield nonterminal, nodes
tos_nodes = stack.get_tos_nodes() tos_nodes = stack.get_tos_nodes()
if tos_nodes: if tos_nodes:
@@ -149,7 +149,7 @@ class Parser(BaseParser):
else: else:
last_leaf = None last_leaf = None
if self._start_symbol == 'file_input' and \ if self._start_nonterminal == 'file_input' and \
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value): (typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
def reduce_stack(states, newstate): def reduce_stack(states, newstate):
# reduce # reduce
@@ -168,13 +168,13 @@ class Parser(BaseParser):
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value) ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
dfa, state, (type_, nodes) = stack[-1] dfa, state, (type_, nodes) = stack[-1]
symbol = pgen_grammar.number2symbol[type_] nonterminal = pgen_grammar.number2nonterminal[type_]
states, first = dfa states, first = dfa
arcs = states[state] arcs = states[state]
# Look for a state with this label # Look for a state with this label
for i, newstate in arcs: for i, newstate in arcs:
if ilabel == i: if ilabel == i:
if symbol == 'simple_stmt': if nonterminal == 'simple_stmt':
# This is basically shifting # This is basically shifting
stack[-1] = (dfa, newstate, (type_, nodes)) stack[-1] = (dfa, newstate, (type_, nodes))
@@ -182,12 +182,12 @@ class Parser(BaseParser):
add_token_callback(typ, value, start_pos, prefix) add_token_callback(typ, value, start_pos, prefix)
return return
# Check if we're at the right point # Check if we're at the right point
#for symbol, nodes in get_symbol_and_nodes(stack): #for nonterminal, nodes in get_nonterminal_and_nodes(stack):
# self.pgen_parser._pop() # self.pgen_parser._pop()
#break #break
break break
#symbol = pgen_grammar.number2symbol[type_] #nonterminal = pgen_grammar.number2nonterminal[type_]
if not self._error_recovery: if not self._error_recovery:
return super(Parser, self).error_recovery( return super(Parser, self).error_recovery(
@@ -198,21 +198,21 @@ class Parser(BaseParser):
# For now just discard everything that is not a suite or # For now just discard everything that is not a suite or
# file_input, if we detect an error. # file_input, if we detect an error.
one_line_suite = False one_line_suite = False
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))): for index, (nonterminal, nodes) in reversed(list(enumerate(get_nonterminal_and_nodes(stack)))):
# `suite` can sometimes be only simple_stmt, not stmt. # `suite` can sometimes be only simple_stmt, not stmt.
if one_line_suite: if one_line_suite:
break break
elif symbol == 'file_input': elif nonterminal == 'file_input':
break break
elif symbol == 'suite': elif nonterminal == 'suite':
if len(nodes) > 1: if len(nodes) > 1:
break break
elif not nodes: elif not nodes:
one_line_suite = True one_line_suite = True
# `suite` without an indent are error nodes. # `suite` without an indent are error nodes.
return index, symbol, nodes return index, nonterminal, nodes
index, symbol, nodes = current_suite(stack) index, nonterminal, nodes = current_suite(stack)
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index) # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos): if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
@@ -226,11 +226,11 @@ class Parser(BaseParser):
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix) error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
stack[-1][2][1].append(error_leaf) stack[-1][2][1].append(error_leaf)
if symbol == 'suite': if nonterminal == 'suite':
dfa, state, node = stack[-1] dfa, state, node = stack[-1]
states, first = dfa states, first = dfa
arcs = states[state] arcs = states[state]
intended_label = pgen_grammar.symbol2label['stmt'] intended_label = pgen_grammar.nonterminal2label['stmt']
# Introduce a proper state transition. We're basically allowing # Introduce a proper state transition. We're basically allowing
# there to be no valid statements inside a suite. # there to be no valid statements inside a suite.
if [x[0] for x in arcs] == [intended_label]: if [x[0] for x in arcs] == [intended_label]: