mirror of
https://github.com/davidhalter/parso.git
synced 2025-12-09 14:14:53 +08:00
Try to completely remove the word "symbol" and use "nonterminal" instead
The occurrences that could not be removed are in grammar.py, because they are part of the public, documented API.
This commit is contained in:
@@ -51,8 +51,8 @@ class Grammar(object):
|
||||
it is invalid, it will be returned as an error node. If disabled,
|
||||
you will get a ParseError when encountering syntax errors in your
|
||||
code.
|
||||
:param str start_symbol: The grammar symbol that you want to parse. Only
|
||||
allowed to be used when error_recovery is False.
|
||||
:param str start_symbol: The grammar rule (nonterminal) that you want
|
||||
to parse. Only allowed to be used when error_recovery is False.
|
||||
:param str path: The path to the file you want to open. Only needed for caching.
|
||||
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
|
||||
if a path is given. Returns the cached trees if the corresponding
|
||||
@@ -88,7 +88,7 @@ class Grammar(object):
|
||||
raise TypeError("Please provide either code or a path.")
|
||||
|
||||
if start_symbol is None:
|
||||
start_symbol = self._start_symbol
|
||||
start_symbol = self._start_nonterminal
|
||||
|
||||
if error_recovery and start_symbol != 'file_input':
|
||||
raise NotImplementedError("This is currently not implemented.")
|
||||
@@ -136,7 +136,7 @@ class Grammar(object):
|
||||
p = self._parser(
|
||||
self._pgen_grammar,
|
||||
error_recovery=error_recovery,
|
||||
start_symbol=start_symbol
|
||||
start_nonterminal=start_symbol
|
||||
)
|
||||
root_node = p.parse(tokens=tokens)
|
||||
|
||||
@@ -186,7 +186,7 @@ class Grammar(object):
|
||||
return normalizer.issues
|
||||
|
||||
def __repr__(self):
|
||||
labels = self._pgen_grammar.number2symbol.values()
|
||||
labels = self._pgen_grammar.number2nonterminal.values()
|
||||
txt = ' '.join(list(labels)[:3]) + ' ...'
|
||||
return '<%s:%s>' % (self.__class__.__name__, txt)
|
||||
|
||||
@@ -194,7 +194,7 @@ class Grammar(object):
|
||||
class PythonGrammar(Grammar):
|
||||
_error_normalizer_config = ErrorFinderConfig()
|
||||
_token_namespace = token
|
||||
_start_symbol = 'file_input'
|
||||
_start_nonterminal = 'file_input'
|
||||
|
||||
def __init__(self, version_info, bnf_text):
|
||||
super(PythonGrammar, self).__init__(
|
||||
|
||||
@@ -38,13 +38,13 @@ class BaseParser(object):
|
||||
}
|
||||
default_leaf = tree.Leaf
|
||||
|
||||
def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False):
|
||||
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
|
||||
self._pgen_grammar = pgen_grammar
|
||||
self._start_symbol = start_symbol
|
||||
self._start_nonterminal = start_nonterminal
|
||||
self._error_recovery = error_recovery
|
||||
|
||||
def parse(self, tokens):
|
||||
start_number = self._pgen_grammar.symbol2number[self._start_symbol]
|
||||
start_number = self._pgen_grammar.nonterminal2number[self._start_nonterminal]
|
||||
self.pgen_parser = PgenParser(
|
||||
self._pgen_grammar, self.convert_node, self.convert_leaf,
|
||||
self.error_recovery, start_number
|
||||
@@ -64,12 +64,12 @@ class BaseParser(object):
|
||||
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
|
||||
|
||||
def convert_node(self, pgen_grammar, type_, children):
|
||||
# TODO REMOVE symbol, we don't want type here.
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
# TODO REMOVE nonterminal, we don't want type here.
|
||||
nonterminal = pgen_grammar.number2nonterminal[type_]
|
||||
try:
|
||||
return self.node_map[symbol](children)
|
||||
return self.node_map[nonterminal](children)
|
||||
except KeyError:
|
||||
return self.default_node(symbol, children)
|
||||
return self.default_node(nonterminal, children)
|
||||
|
||||
def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos):
|
||||
try:
|
||||
|
||||
@@ -28,12 +28,14 @@ class Grammar(object):
|
||||
|
||||
The instance variables are as follows:
|
||||
|
||||
symbol2number -- a dict mapping symbol names to numbers. Symbol
|
||||
numbers are always 256 or higher, to distinguish
|
||||
them from token numbers, which are between 0 and
|
||||
255 (inclusive).
|
||||
nonterminal2number --
|
||||
A dict mapping nonterminal names to numbers.
|
||||
Nonterminal numbers are always 256 or higher, to
|
||||
distinguish them from token numbers, which are between 0
|
||||
and 255 (inclusive).
|
||||
|
||||
number2symbol -- a dict mapping numbers to symbol names;
|
||||
number2nonterminal --
|
||||
A dict mapping numbers to nonterminal names;
|
||||
these two are each other's inverse.
|
||||
|
||||
states -- a list of DFAs, where each DFA is a list of
|
||||
@@ -44,20 +46,20 @@ class Grammar(object):
|
||||
Final states are represented by a special arc of
|
||||
the form (0, j) where j is its own state number.
|
||||
|
||||
dfas -- a dict mapping symbol numbers to (DFA, first)
|
||||
dfas -- a dict mapping nonterminal numbers to (DFA, first)
|
||||
pairs, where DFA is an item from the states list
|
||||
above, and first is a set of tokens that can
|
||||
begin this grammar rule (represented by a dict
|
||||
whose values are always 1).
|
||||
|
||||
labels -- a list of (x, y) pairs where x is either a token
|
||||
number or a symbol number, and y is either None
|
||||
number or a nonterminal number, and y is either None
|
||||
or a string; the strings are keywords. The label
|
||||
number is the index in this list; label numbers
|
||||
are used to mark state transitions (arcs) in the
|
||||
DFAs.
|
||||
|
||||
start -- the number of the grammar's start symbol.
|
||||
start -- the number of the grammar's start nonterminal.
|
||||
|
||||
keywords -- a dict mapping keyword strings to arc labels.
|
||||
|
||||
@@ -65,29 +67,29 @@ class Grammar(object):
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, bnf_text, start_symbol):
|
||||
self.symbol2number = {}
|
||||
self.number2symbol = {}
|
||||
def __init__(self, bnf_text, start_nonterminal):
|
||||
self.nonterminal2number = {}
|
||||
self.number2nonterminal = {}
|
||||
self.states = []
|
||||
self.dfas = {}
|
||||
self.labels = [(0, "EMPTY")]
|
||||
self.keywords = {}
|
||||
self.tokens = {}
|
||||
self.symbol2label = {}
|
||||
self.label2symbol = {}
|
||||
self.start_symbol = start_symbol
|
||||
self.nonterminal2label = {}
|
||||
self.label2nonterminal = {}
|
||||
self.start_nonterminal = start_nonterminal
|
||||
|
||||
@property
|
||||
def start(self):
|
||||
return self.symbol2number[self.start_symbol]
|
||||
return self.nonterminal2number[self.start_nonterminal]
|
||||
|
||||
def report(self):
|
||||
"""Dump the grammar tables to standard output, for debugging."""
|
||||
from pprint import pprint
|
||||
print("s2n")
|
||||
pprint(self.symbol2number)
|
||||
pprint(self.nonterminal2number)
|
||||
print("n2s")
|
||||
pprint(self.number2symbol)
|
||||
pprint(self.number2nonterminal)
|
||||
print("states")
|
||||
pprint(self.states)
|
||||
print("dfas")
|
||||
|
||||
@@ -118,8 +118,8 @@ class PgenParser(object):
|
||||
up.
|
||||
|
||||
A concrete syntax tree node is a (type, nodes) tuple, where
|
||||
type is the node type (a token or symbol number) and nodes
|
||||
is a list of children for symbols, and None for tokens.
|
||||
type is the node type (a token or nonterminal number) and nodes
|
||||
is a list of children for nonterminals, and None for tokens.
|
||||
|
||||
An abstract syntax tree node may be anything; this is entirely
|
||||
up to the converter function.
|
||||
@@ -184,11 +184,11 @@ class PgenParser(object):
|
||||
# Done with this token
|
||||
return False
|
||||
elif t >= 256:
|
||||
# See if it's a symbol and if we're in its first set
|
||||
# See if it's a nonterminal and if we're in its first set
|
||||
itsdfa = _gram.dfas[t]
|
||||
itsstates, itsfirst = itsdfa
|
||||
if ilabel in itsfirst:
|
||||
# Push a symbol
|
||||
# Push a nonterminal
|
||||
_push(t, itsdfa, newstate)
|
||||
break # To continue the outer while loop
|
||||
else:
|
||||
@@ -231,7 +231,7 @@ class PgenParser(object):
|
||||
try:
|
||||
# Equal to:
|
||||
# dfa, state, node = self.stack[-1]
|
||||
# symbol, children = node
|
||||
# nonterminal, children = node
|
||||
self.stack[-1][2][1].append(newnode)
|
||||
except IndexError:
|
||||
# Stack is empty, set the rootnode.
|
||||
|
||||
@@ -29,7 +29,8 @@ class ParserGenerator(object):
|
||||
self._nonterminal_to_dfas = rule_to_dfas
|
||||
|
||||
def make_grammar(self, grammar):
|
||||
self._first_terminals = {} # map from symbol name to set of tokens
|
||||
# Map from grammar rule (nonterminal) name to a set of tokens.
|
||||
self._first_terminals = {}
|
||||
|
||||
names = list(self._nonterminal_to_dfas.keys())
|
||||
names.sort()
|
||||
@@ -37,9 +38,9 @@ class ParserGenerator(object):
|
||||
if name not in self._first_terminals:
|
||||
self._calculate_first_terminals(name)
|
||||
|
||||
i = 256 + len(grammar.symbol2number)
|
||||
grammar.symbol2number[name] = i
|
||||
grammar.number2symbol[i] = name
|
||||
i = 256 + len(grammar.nonterminal2number)
|
||||
grammar.nonterminal2number[name] = i
|
||||
grammar.number2nonterminal[i] = name
|
||||
|
||||
# Now that we have calculated the first terminals, we are sure that
|
||||
# there is no left recursion or ambiguities.
|
||||
@@ -55,7 +56,7 @@ class ParserGenerator(object):
|
||||
arcs.append((0, dfas.index(state)))
|
||||
states.append(arcs)
|
||||
grammar.states.append(states)
|
||||
grammar.dfas[grammar.symbol2number[name]] = (states, self._make_first(grammar, name))
|
||||
grammar.dfas[grammar.nonterminal2number[name]] = (states, self._make_first(grammar, name))
|
||||
return grammar
|
||||
|
||||
def _make_first(self, grammar, name):
|
||||
@@ -71,15 +72,15 @@ class ParserGenerator(object):
|
||||
# XXX Maybe this should be a method on a subclass of converter?
|
||||
ilabel = len(grammar.labels)
|
||||
if label[0].isalpha():
|
||||
# Either a symbol name or a named token
|
||||
if label in grammar.symbol2number:
|
||||
# A symbol name (a non-terminal)
|
||||
if label in grammar.symbol2label:
|
||||
return grammar.symbol2label[label]
|
||||
# Either a nonterminal name or a named token
|
||||
if label in grammar.nonterminal2number:
|
||||
# A nonterminal name (a grammar rule)
|
||||
if label in grammar.nonterminal2label:
|
||||
return grammar.nonterminal2label[label]
|
||||
else:
|
||||
grammar.labels.append((grammar.symbol2number[label], None))
|
||||
grammar.symbol2label[label] = ilabel
|
||||
grammar.label2symbol[ilabel] = label
|
||||
grammar.labels.append((grammar.nonterminal2number[label], None))
|
||||
grammar.nonterminal2label[label] = ilabel
|
||||
grammar.label2nonterminal[ilabel] = label
|
||||
return ilabel
|
||||
else:
|
||||
# A named token (NAME, NUMBER, STRING)
|
||||
@@ -293,7 +294,7 @@ def generate_grammar(bnf_grammar, token_namespace):
|
||||
own parser.
|
||||
"""
|
||||
rule_to_dfas = {}
|
||||
start_symbol = None
|
||||
start_nonterminal = None
|
||||
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
|
||||
#_dump_nfa(a, z)
|
||||
dfas = _make_dfas(nfa_a, nfa_z)
|
||||
@@ -304,8 +305,8 @@ def generate_grammar(bnf_grammar, token_namespace):
|
||||
rule_to_dfas[nfa_a.from_rule] = dfas
|
||||
#print(nfa_a.from_rule, oldlen, newlen)
|
||||
|
||||
if start_symbol is None:
|
||||
start_symbol = nfa_a.from_rule
|
||||
if start_nonterminal is None:
|
||||
start_nonterminal = nfa_a.from_rule
|
||||
|
||||
p = ParserGenerator(rule_to_dfas, token_namespace)
|
||||
return p.make_grammar(Grammar(bnf_grammar, start_symbol))
|
||||
return p.make_grammar(Grammar(bnf_grammar, start_nonterminal))
|
||||
|
||||
@@ -41,9 +41,9 @@ def _flows_finished(pgen_grammar, stack):
|
||||
if, while, for and try might not be finished, because another part might
|
||||
still be parsed.
|
||||
"""
|
||||
for dfa, newstate, (symbol_number, nodes) in stack:
|
||||
if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt',
|
||||
'for_stmt', 'try_stmt'):
|
||||
for dfa, newstate, (nonterminal_number, nodes) in stack:
|
||||
if pgen_grammar.number2nonterminal[nonterminal_number] \
|
||||
in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -52,8 +52,8 @@ def suite_or_file_input_is_valid(pgen_grammar, stack):
|
||||
if not _flows_finished(pgen_grammar, stack):
|
||||
return False
|
||||
|
||||
for dfa, newstate, (symbol_number, nodes) in reversed(stack):
|
||||
if pgen_grammar.number2symbol[symbol_number] == 'suite':
|
||||
for dfa, newstate, (nonterminal_number, nodes) in reversed(stack):
|
||||
if pgen_grammar.number2nonterminal[nonterminal_number] == 'suite':
|
||||
# If only newline is in the suite, the suite is not valid, yet.
|
||||
return len(nodes) > 1
|
||||
# Not reaching a suite means that we're dealing with file_input levels
|
||||
|
||||
@@ -62,8 +62,8 @@ class Parser(BaseParser):
|
||||
FSTRING_END: tree.FStringEnd,
|
||||
}
|
||||
|
||||
def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'):
|
||||
super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery)
|
||||
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
|
||||
super(Parser, self).__init__(pgen_grammar, start_nonterminal, error_recovery=error_recovery)
|
||||
|
||||
self.syntax_errors = []
|
||||
self._omit_dedent_list = []
|
||||
@@ -81,19 +81,19 @@ class Parser(BaseParser):
|
||||
|
||||
def parse(self, tokens):
|
||||
if self._error_recovery:
|
||||
if self._start_symbol != 'file_input':
|
||||
if self._start_nonterminal != 'file_input':
|
||||
raise NotImplementedError
|
||||
|
||||
tokens = self._recovery_tokenize(tokens)
|
||||
|
||||
node = super(Parser, self).parse(tokens)
|
||||
|
||||
if self._start_symbol == 'file_input' != node.type:
|
||||
if self._start_nonterminal == 'file_input' != node.type:
|
||||
# If there's only one statement, we get back a non-module. That's
|
||||
# not what we want, we want a module, so we add it here:
|
||||
node = self.convert_node(
|
||||
self._pgen_grammar,
|
||||
self._pgen_grammar.symbol2number['file_input'],
|
||||
self._pgen_grammar.nonterminal2number['file_input'],
|
||||
[node]
|
||||
)
|
||||
|
||||
@@ -107,24 +107,24 @@ class Parser(BaseParser):
|
||||
grammar rule produces a new complete node, so that the tree is built
|
||||
strictly bottom-up.
|
||||
"""
|
||||
# TODO REMOVE symbol, we don't want type here.
|
||||
symbol = pgen_grammar.number2symbol[type]
|
||||
# TODO REMOVE nonterminal, we don't want type here.
|
||||
nonterminal = pgen_grammar.number2nonterminal[type]
|
||||
try:
|
||||
return self.node_map[symbol](children)
|
||||
return self.node_map[nonterminal](children)
|
||||
except KeyError:
|
||||
if symbol == 'suite':
|
||||
if nonterminal == 'suite':
|
||||
# We don't want the INDENT/DEDENT in our parser tree. Those
|
||||
# leaves are just cancer. They are virtual leaves and not real
|
||||
# ones and therefore have pseudo start/end positions and no
|
||||
# prefixes. Just ignore them.
|
||||
children = [children[0]] + children[2:-1]
|
||||
elif symbol == 'list_if':
|
||||
elif nonterminal == 'list_if':
|
||||
# Make transitioning from 2 to 3 easier.
|
||||
symbol = 'comp_if'
|
||||
elif symbol == 'listmaker':
|
||||
nonterminal = 'comp_if'
|
||||
elif nonterminal == 'listmaker':
|
||||
# Same as list_if above.
|
||||
symbol = 'testlist_comp'
|
||||
return self.default_node(symbol, children)
|
||||
nonterminal = 'testlist_comp'
|
||||
return self.default_node(nonterminal, children)
|
||||
|
||||
def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
|
||||
# print('leaf', repr(value), token.tok_name[type])
|
||||
@@ -138,10 +138,10 @@ class Parser(BaseParser):
|
||||
|
||||
def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
|
||||
add_token_callback):
|
||||
def get_symbol_and_nodes(stack):
|
||||
def get_nonterminal_and_nodes(stack):
|
||||
for dfa, state, (type_, nodes) in stack:
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
yield symbol, nodes
|
||||
nonterminal = pgen_grammar.number2nonterminal[type_]
|
||||
yield nonterminal, nodes
|
||||
|
||||
tos_nodes = stack.get_tos_nodes()
|
||||
if tos_nodes:
|
||||
@@ -149,7 +149,7 @@ class Parser(BaseParser):
|
||||
else:
|
||||
last_leaf = None
|
||||
|
||||
if self._start_symbol == 'file_input' and \
|
||||
if self._start_nonterminal == 'file_input' and \
|
||||
(typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value):
|
||||
def reduce_stack(states, newstate):
|
||||
# reduce
|
||||
@@ -168,13 +168,13 @@ class Parser(BaseParser):
|
||||
ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value)
|
||||
|
||||
dfa, state, (type_, nodes) = stack[-1]
|
||||
symbol = pgen_grammar.number2symbol[type_]
|
||||
nonterminal = pgen_grammar.number2nonterminal[type_]
|
||||
states, first = dfa
|
||||
arcs = states[state]
|
||||
# Look for a state with this label
|
||||
for i, newstate in arcs:
|
||||
if ilabel == i:
|
||||
if symbol == 'simple_stmt':
|
||||
if nonterminal == 'simple_stmt':
|
||||
# This is basically shifting
|
||||
stack[-1] = (dfa, newstate, (type_, nodes))
|
||||
|
||||
@@ -182,12 +182,12 @@ class Parser(BaseParser):
|
||||
add_token_callback(typ, value, start_pos, prefix)
|
||||
return
|
||||
# Check if we're at the right point
|
||||
#for symbol, nodes in get_symbol_and_nodes(stack):
|
||||
#for nonterminal, nodes in get_nonterminal_and_nodes(stack):
|
||||
# self.pgen_parser._pop()
|
||||
|
||||
#break
|
||||
break
|
||||
#symbol = pgen_grammar.number2symbol[type_]
|
||||
#nonterminal = pgen_grammar.number2nonterminal[type_]
|
||||
|
||||
if not self._error_recovery:
|
||||
return super(Parser, self).error_recovery(
|
||||
@@ -198,21 +198,21 @@ class Parser(BaseParser):
|
||||
# For now just discard everything that is not a suite or
|
||||
# file_input, if we detect an error.
|
||||
one_line_suite = False
|
||||
for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))):
|
||||
for index, (nonterminal, nodes) in reversed(list(enumerate(get_nonterminal_and_nodes(stack)))):
|
||||
# `suite` can sometimes be only simple_stmt, not stmt.
|
||||
if one_line_suite:
|
||||
break
|
||||
elif symbol == 'file_input':
|
||||
elif nonterminal == 'file_input':
|
||||
break
|
||||
elif symbol == 'suite':
|
||||
elif nonterminal == 'suite':
|
||||
if len(nodes) > 1:
|
||||
break
|
||||
elif not nodes:
|
||||
one_line_suite = True
|
||||
# A `suite` without an indent is an error node.
|
||||
return index, symbol, nodes
|
||||
return index, nonterminal, nodes
|
||||
|
||||
index, symbol, nodes = current_suite(stack)
|
||||
index, nonterminal, nodes = current_suite(stack)
|
||||
|
||||
# print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
|
||||
if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos):
|
||||
@@ -226,11 +226,11 @@ class Parser(BaseParser):
|
||||
error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix)
|
||||
stack[-1][2][1].append(error_leaf)
|
||||
|
||||
if symbol == 'suite':
|
||||
if nonterminal == 'suite':
|
||||
dfa, state, node = stack[-1]
|
||||
states, first = dfa
|
||||
arcs = states[state]
|
||||
intended_label = pgen_grammar.symbol2label['stmt']
|
||||
intended_label = pgen_grammar.nonterminal2label['stmt']
|
||||
# Introduce a proper state transition. We're basically allowing
|
||||
# there to be no valid statements inside a suite.
|
||||
if [x[0] for x in arcs] == [intended_label]:
|
||||
|
||||
Reference in New Issue
Block a user