Try to completely remove the word symbol and use nonterminal

The ones that we could not remove are in grammar.py, because that's the public documented API.
Dave Halter
2018-06-17 18:30:20 +02:00
parent 640f544af9
commit 73ce57428b
7 changed files with 89 additions and 86 deletions

View File

@@ -51,8 +51,8 @@ class Grammar(object):
it is invalid, it will be returned as an error node. If disabled,
you will get a ParseError when encountering syntax errors in your
code.
:param str start_symbol: The grammar symbol that you want to parse. Only
allowed to be used when error_recovery is False.
:param str start_symbol: The grammar rule (nonterminal) that you want
to parse. Only allowed to be used when error_recovery is False.
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
@@ -88,7 +88,7 @@ class Grammar(object):
raise TypeError("Please provide either code or a path.")
if start_symbol is None:
start_symbol = self._start_symbol
start_symbol = self._start_nonterminal
if error_recovery and start_symbol != 'file_input':
raise NotImplementedError("This is currently not implemented.")
@@ -136,7 +136,7 @@ class Grammar(object):
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_symbol=start_symbol
start_nonterminal=start_symbol
)
root_node = p.parse(tokens=tokens)
@@ -186,7 +186,7 @@ class Grammar(object):
return normalizer.issues
def __repr__(self):
labels = self._pgen_grammar.number2symbol.values()
labels = self._pgen_grammar.number2nonterminal.values()
txt = ' '.join(list(labels)[:3]) + ' ...'
return '<%s:%s>' % (self.__class__.__name__, txt)
@@ -194,7 +194,7 @@ class Grammar(object):
class PythonGrammar(Grammar):
_error_normalizer_config = ErrorFinderConfig()
_token_namespace = token
_start_symbol = 'file_input'
_start_nonterminal = 'file_input'
def __init__(self, version_info, bnf_text):
super(PythonGrammar, self).__init__(
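As a usage sketch of the public API described in the docstring above (the start_symbol keyword is kept because it is documented; using 'eval_input' as an alternative start rule is an assumption about the shipped Python grammar):

    import parso

    grammar = parso.load_grammar()

    # Default: parse a whole module with error recovery enabled.
    module = grammar.parse("x = 1\n")

    # A different start rule requires error_recovery=False, as documented above.
    expr = grammar.parse("1 + 2", error_recovery=False, start_symbol="eval_input")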

View File

@@ -38,13 +38,13 @@ class BaseParser(object):
}
default_leaf = tree.Leaf
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False):
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
self._pgen_grammar = pgen_grammar
self._start_symbol = start_symbol
self._start_nonterminal = start_nonterminal
self._error_recovery = error_recovery
def parse(self, tokens):
start_number = self._pgen_grammar.symbol2number[self._start_symbol]
start_number = self._pgen_grammar.nonterminal2number[self._start_nonterminal]
self.pgen_parser = PgenParser(
self._pgen_grammar, self.convert_node, self.convert_leaf,
self.error_recovery, start_number
@@ -64,12 +64,12 @@ class BaseParser(object):
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
def convert_node(self, pgen_grammar, type_, children):
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type_]
# TODO REMOVE nonterminal, we don't want type here.
nonterminal = pgen_grammar.number2nonterminal[type_]
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
return self.default_node(symbol, children)
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
try:

View File

@@ -28,12 +28,14 @@ class Grammar(object):
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
nonterminal2number --
A dict mapping nonterminal names to numbers.
Nonterminal numbers are always 256 or higher, to
distinguish them from token numbers, which are between 0
and 255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
number2nonterminal --
A dict mapping numbers to nonterminal names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
@@ -44,20 +46,20 @@ class Grammar(object):
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
dfas -- a dict mapping nonterminal numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
number or a nonterminal number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
start -- the number of the grammar's start nonterminal.
keywords -- a dict mapping keyword strings to arc labels.
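To make these renamed tables concrete, here is a hypothetical snapshot for a two-rule grammar (all names and numbers below are invented for illustration, not dumped from a real pgen run):

    # Nonterminal numbers start at 256 so they never collide with token numbers.
    nonterminal2number = {'file_input': 256, 'simple_stmt': 257}
    number2nonterminal = {n: name for name, n in nonterminal2number.items()}

    # dfas: nonterminal number -> (DFA, first set). The DFA is a list of states,
    # each state a list of (label, next_state) arcs; (0, j) marks state j as final.
    dfas = {
        257: (
            [[(2, 1)], [(0, 1)]],   # two states for 'simple_stmt'
            {2: 1},                 # first set: labels that may start the rule, values always 1
        ),
    }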
@@ -65,29 +67,29 @@ class Grammar(object):
"""
def __init__(self, bnf_text, start_symbol):
self.symbol2number = {}
self.number2symbol = {}
def __init__(self, bnf_text, start_nonterminal):
self.nonterminal2number = {}
self.number2nonterminal = {}
self.states = []
self.dfas = {}
self.labels = [(0, "EMPTY")]
self.keywords = {}
self.tokens = {}
self.symbol2label = {}
self.label2symbol = {}
self.start_symbol = start_symbol
self.nonterminal2label = {}
self.label2nonterminal = {}
self.start_nonterminal = start_nonterminal
@property
def start(self):
return self.symbol2number[self.start_symbol]
return self.nonterminal2number[self.start_nonterminal]
def report(self):
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
pprint(self.nonterminal2number)
print("n2s")
pprint(self.number2symbol)
pprint(self.number2nonterminal)
print("states")
pprint(self.states)
print("dfas")

View File

@@ -118,8 +118,8 @@ class PgenParser(object):
up.
A concrete syntax tree node is a (type, nodes) tuple, where
type is the node type (a token or symbol number) and nodes
is a list of children for symbols, and None for tokens.
type is the node type (a token or nonterminal number) and nodes
is a list of children for nonterminals, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
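A minimal sketch of that (type, nodes) convention, with invented numbers (tokens stay below 256, nonterminals start at 256):

    NAME = 1              # a terminal (token) number
    simple_stmt = 257     # a nonterminal number

    name_node = (NAME, None)                # token nodes carry no children
    stmt_node = (simple_stmt, [name_node])  # nonterminal nodes carry a list of children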
@@ -184,11 +184,11 @@ class PgenParser(object):
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
# See if it's a nonterminal and if we're in its first set
itsdfa = _gram.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
# Push a nonterminal
_push(t, itsdfa, newstate)
break # To continue the outer while loop
else:
@@ -231,7 +231,7 @@ class PgenParser(object):
try:
# Equal to:
# dfa, state, node = self.stack[-1]
# symbol, children = node
# nonterminal, children = node
self.stack[-1][2][1].append(newnode)
except IndexError:
# Stack is empty, set the rootnode.

View File

@@ -29,7 +29,8 @@ class ParserGenerator(object):
self._nonterminal_to_dfas = rule_to_dfas
def make_grammar(self, grammar):
self._first_terminals = {} # map from symbol name to set of tokens
# Map from grammar rule (nonterminal) name to a set of tokens.
self._first_terminals = {}
names = list(self._nonterminal_to_dfas.keys())
names.sort()
@@ -37,9 +38,9 @@ class ParserGenerator(object):
if name not in self._first_terminals:
self._calculate_first_terminals(name)
i = 256 + len(grammar.symbol2number)
grammar.symbol2number[name] = i
grammar.number2symbol[i] = name
i = 256 + len(grammar.nonterminal2number)
grammar.nonterminal2number[name] = i
grammar.number2nonterminal[i] = name
# Now that we have calculated the first terminals, we are sure that
# there is no left recursion or ambiguities.
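The renamed bookkeeping follows the same simple scheme as before: each rule name, taken in sorted order, gets the next free number at or above 256. A standalone sketch of that loop (the rule names are just examples):

    nonterminal2number = {}
    number2nonterminal = {}
    for name in sorted(['file_input', 'simple_stmt', 'expr_stmt']):
        i = 256 + len(nonterminal2number)
        nonterminal2number[name] = i
        number2nonterminal[i] = name
    # nonterminal2number == {'expr_stmt': 256, 'file_input': 257, 'simple_stmt': 258}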
@@ -55,7 +56,7 @@ class ParserGenerator(object):
arcs.append((0, dfas.index(state)))
states.append(arcs)
grammar.states.append(states)
grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name))
grammar.dfas[grammar.nonterminal2number[name]] = (states, self._make_first(grammar, name))
return grammar
def _make_first(self, grammar, name):
@@ -71,15 +72,15 @@ class ParserGenerator(object):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(grammar.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in grammar.symbol2number:
# A symbol name (a non-terminal)
if label in grammar.symbol2label:
return grammar.symbol2label[label]
# Either a nonterminal name or a named token
if label in grammar.nonterminal2number:
# A nonterminal name (a non-terminal)
if label in grammar.nonterminal2label:
return grammar.nonterminal2label[label]
else:
grammar.labels.append((grammar.symbol2number[label], None))
grammar.symbol2label[label] = ilabel
grammar.label2symbol[ilabel] = label
grammar.labels.append((grammar.nonterminal2number[label], None))
grammar.nonterminal2label[label] = ilabel
grammar.label2nonterminal[ilabel] = label
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
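Put differently: every distinct label (a nonterminal reference, a named token, or a keyword) occupies one slot in grammar.labels, and DFA arcs refer to that slot by index; nonterminal2label and label2nonterminal simply cache the lookup in both directions. A hypothetical snapshot, assuming 'expr_stmt' was numbered 256 and NAME is token number 1:

    labels = [
        (0, 'EMPTY'),   # index 0 is reserved
        (1, None),      # index 1: the NAME token
        (256, None),    # index 2: a reference to the nonterminal 'expr_stmt'
        (1, 'if'),      # index 3: the keyword 'if' (a NAME with a fixed string)
    ]
    nonterminal2label = {'expr_stmt': 2}
    label2nonterminal = {2: 'expr_stmt'}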
@@ -293,7 +294,7 @@ def generate_grammar(bnf_grammar, token_namespace):
own parser.
"""
rule_to_dfas = {}
start_symbol = None
start_nonterminal = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(a, z)
dfas = _make_dfas(nfa_a, nfa_z)
@@ -304,8 +305,8 @@ def generate_grammar(bnf_grammar, token_namespace):
rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen)
if start_symbol is None:
start_symbol = nfa_a.from_rule
if start_nonterminal is None:
start_nonterminal = nfa_a.from_rule
p = ParserGenerator(rule_to_dfas, token_namespace)
return p.make_grammar(Grammar(bnf_grammar, start_symbol))
return p.make_grammar(Grammar(bnf_grammar, start_nonterminal))
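A usage sketch of this entry point (internal pgen2 API, so the import paths and the grammar file location are assumptions and may differ between parso versions):

    from parso.pgen2 import generate_grammar
    from parso.python import token

    with open('grammar36.txt') as f:      # hypothetical path to a CPython-style grammar file
        bnf_text = f.read()

    pgen_grammar = generate_grammar(bnf_text, token_namespace=token)
    print(sorted(pgen_grammar.nonterminal2number)[:5])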

View File

@@ -41,9 +41,9 @@ def _flows_finished(pgen_grammar, stack):
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for dfa, newstate, (symbol_number, nodes) in stack:
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
'for_stmt', 'try_stmt'):
for dfa, newstate, (nonterminal_number, nodes) in stack:
if pgen_grammar.number2nonterminal[nonterminal_number] \
in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False
return True
@@ -52,8 +52,8 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack):
return False
for dfa, newstate, (symbol_number, nodes) in reversed(stack):
if pgen_grammar.number2symbol[symbol_number] == 'suite':
for dfa, newstate, (nonterminal_number, nodes) in reversed(stack):
if pgen_grammar.number2nonterminal[nonterminal_number] == 'suite':
# If only newline is in the suite, the suite is not valid, yet.
return len(nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels

View File

@@ -62,8 +62,8 @@ class Parser(BaseParser):
FSTRING_END: tree.FStringEnd,
}
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
super(Parser, self).__init__(pgen_grammar, start_nonterminal, error_recovery=error_recovery)
self.syntax_errors = []
self._omit_dedent_list = []
@@ -81,19 +81,19 @@ class Parser(BaseParser):
def parse(self, tokens):
if self._error_recovery:
if self._start_symbol != 'file_input':
if self._start_nonterminal != 'file_input':
raise NotImplementedError
tokens = self._recovery_tokenize(tokens)
node = super(Parser, self).parse(tokens)
if self._start_symbol == 'file_input' != node.type:
if self._start_nonterminal == 'file_input' != node.type:
# If there's only one statement, we get back a non-module. That's
# not what we want, we want a module, so we add it here:
node = self.convert_node(
self._pgen_grammar,
self._pgen_grammar.symbol2number['file_input'],
self._pgen_grammar.nonterminal2number['file_input'],
[node]
)
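The observable effect of this wrapping is that, for the default start nonterminal, the public API always returns a module node even for one-statement input (a small sketch using parso's top-level helper):

    import parso

    tree = parso.parse("x = 1")
    print(tree.type)   # expected: 'file_input' (a module), not a bare statement node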
@@ -107,24 +107,24 @@ class Parser(BaseParser):
grammar rule produces a new complete node, so that the tree is built
strictly bottom-up.
"""
# TODO REMOVE symbol, we don't want type here.
symbol = pgen_grammar.number2symbol[type]
# TODO REMOVE nonterminal, we don't want type here.
nonterminal = pgen_grammar.number2nonterminal[type]
try:
return self.node_map[symbol](children)
return self.node_map[nonterminal](children)
except KeyError:
if symbol == 'suite':
if nonterminal == 'suite':
# We don't want the INDENT/DEDENT in our parser tree. Those
# leaves are just cancer. They are virtual leaves and not real
# ones and therefore have pseudo start/end positions and no
# prefixes. Just ignore them.
children = [children[0]] + children[2:-1]
elif symbol == 'list_if':
elif nonterminal == 'list_if':
# Make transitioning from 2 to 3 easier.
symbol = 'comp_if'
elif symbol == 'listmaker':
nonterminal = 'comp_if'
elif nonterminal == 'listmaker':
# Same as list_if above.
symbol = 'testlist_comp'
return self.default_node(symbol, children)
nonterminal = 'testlist_comp'
return self.default_node(nonterminal, children)
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
# print('leaf', repr(value), token.tok_name[type])
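As a quick check of the suite handling above, the INDENT/DEDENT leaves really do disappear from the tree; the exact child types are version-dependent, so this is only a sketch:

    import parso

    if_stmt = parso.parse("if x:\n    pass\n").children[0]
    suite = if_stmt.children[-1]
    print(suite.type)                        # 'suite'
    print([c.type for c in suite.children])  # e.g. ['newline', 'simple_stmt'] - no indent/dedent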
@@ -138,10 +138,10 @@ class Parser(BaseParser):
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
add_token_callback):
def get_symbol_and_nodes(stack):
def get_nonterminal_and_nodes(stack):
for dfa, state, (type_, nodes) in stack:
symbol = pgen_grammar.number2symbol[type_]
yield symbol, nodes
nonterminal = pgen_grammar.number2nonterminal[type_]
yield nonterminal, nodes
tos_nodes = stack.get_tos_nodes()
if tos_nodes:
@@ -149,7 +149,7 @@ class Parser(BaseParser):
else:
last_leaf = None
if self._start_symbol == 'file_input' and \
if self._start_nonterminal == 'file_input' and \
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
def reduce_stack(states, newstate):
# reduce
@@ -168,13 +168,13 @@ class Parser(BaseParser):
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
dfa, state, (type_, nodes) = stack[-1]
symbol = pgen_grammar.number2symbol[type_]
nonterminal = pgen_grammar.number2nonterminal[type_]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
if ilabel == i:
if symbol == 'simple_stmt':
if nonterminal == 'simple_stmt':
# This is basically shifting
stack[-1] = (dfa, newstate, (type_, nodes))
@@ -182,12 +182,12 @@ class Parser(BaseParser):
add_token_callback(typ, value, start_pos, prefix)
return
# Check if we're at the right point
#for symbol, nodes in get_symbol_and_nodes(stack):
#for nonterminal, nodes in get_nonterminal_and_nodes(stack):
# self.pgen_parser._pop()
#break
break
#symbol = pgen_grammar.number2symbol[type_]
#nonterminal = pgen_grammar.number2nonterminal[type_]
if not self._error_recovery:
return super(Parser, self).error_recovery(
@@ -198,21 +198,21 @@ class Parser(BaseParser):
# For now just discard everything that is not a suite or
# file_input, if we detect an error.
one_line_suite = False
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
for index, (nonterminal, nodes) in reversed(list(enumerate(get_nonterminal_and_nodes(stack)))):
# `suite` can sometimes be only simple_stmt, not stmt.
if one_line_suite:
break
elif symbol == 'file_input':
elif nonterminal == 'file_input':
break
elif symbol == 'suite':
elif nonterminal == 'suite':
if len(nodes) > 1:
break
elif not nodes:
one_line_suite = True
# `suite` without an indent are error nodes.
return index, symbol, nodes
return index, nonterminal, nodes
index, symbol, nodes = current_suite(stack)
index, nonterminal, nodes = current_suite(stack)
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
@@ -226,11 +226,11 @@ class Parser(BaseParser):
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
stack[-1][2][1].append(error_leaf)
if symbol == 'suite':
if nonterminal == 'suite':
dfa, state, node = stack[-1]
states, first = dfa
arcs = states[state]
intended_label = pgen_grammar.symbol2label['stmt']
intended_label = pgen_grammar.nonterminal2label['stmt']
# Introduce a proper state transition. We're basically allowing
# there to be no valid statements inside a suite.
if [x[0] for x in arcs] == [intended_label]:
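For reference, the user-visible outcome of this recovery path: syntactically broken input still yields a tree, with the offending pieces collected into error nodes and error leaves instead of raising ParserSyntaxError (a small sketch via the public API; the exact node layout is illustrative):

    import parso

    module = parso.parse("def f(:\n    pass\n")
    print([c.type for c in module.children])
    # expected to contain 'error_node' / 'error_leaf' entries alongside 'endmarker'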