From a46ecbb49912153c67e484b13da21bb2909bd109 Mon Sep 17 00:00:00 2001
From: Dave Halter
Date: Tue, 26 Jun 2018 00:58:19 +0200
Subject: [PATCH] Fix an ambiguity issue

Unfortunately had to refactor most of the transition generation
---
 parso/pgen2/grammar.py | 97 ++++++++++++++----------------------------
 parso/pgen2/pgen.py    | 46 +++++++++++++++++++-
 test/test_pgen2.py     |  3 ++
 3 files changed, 80 insertions(+), 66 deletions(-)

diff --git a/parso/pgen2/grammar.py b/parso/pgen2/grammar.py
index c8aaaad..9409c4a 100644
--- a/parso/pgen2/grammar.py
+++ b/parso/pgen2/grammar.py
@@ -16,8 +16,6 @@ fallback token code OP, but the parser needs the actual token code.
 
 """
 
-from ast import literal_eval
-
 
 class DFAPlan(object):
     def __init__(self, next_dfa, dfa_pushes=[]):
@@ -28,11 +26,6 @@ class DFAPlan(object):
         return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)
 
 
-class ReservedString(object):
-    def __init__(self, value):
-        self.value = value
-
-
 class Grammar(object):
     """Pgen parsing tables conversion class.
 
@@ -43,11 +36,10 @@ class Grammar(object):
     do this (see the conv and pgen modules).
     """
 
-    def __init__(self, bnf_grammar, start_nonterminal, rule_to_dfas, token_namespace):
-        self._token_namespace = token_namespace
+    def __init__(self, bnf_grammar, start_nonterminal, rule_to_dfas, reserved_syntax_strings):
         self._nonterminal_to_dfas = rule_to_dfas
 
-        self.reserved_syntax_strings = {}
+        self.reserved_syntax_strings = reserved_syntax_strings
         self.start_nonterminal = start_nonterminal
 
         self._make_grammar()
@@ -68,33 +60,10 @@ class Grammar(object):
 
         for dfas in self._nonterminal_to_dfas.values():
             for dfa_state in dfas:
-                dfa_state.ilabel_to_plan = plans = {}
-                for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
-                    if terminal_or_nonterminal in self._nonterminal_to_dfas:
-                        for t, pushes in self._first_plans[terminal_or_nonterminal].items():
-                            plans[self._make_label(t)] = DFAPlan(next_dfa, pushes)
-                    else:
-                        ilabel = self._make_label(terminal_or_nonterminal)
-                        plans[ilabel] = DFAPlan(next_dfa)
-
-    def _make_label(self, label):
-        if label[0].isalpha():
-            # Either a nonterminal name or a named token
-            assert label not in self._nonterminal_to_dfas
-
-            # A named token (e.g. NAME, NUMBER, STRING)
-            token_type = getattr(self._token_namespace, label, None)
-            return token_type
-        else:
-            # Either a keyword or an operator
-            assert label[0] in ('"', "'"), label
-            # TODO use literal_eval instead of a simple eval.
-            value = literal_eval(label)
-            try:
-                return self.reserved_syntax_strings[value]
-            except KeyError:
-                r = self.reserved_syntax_strings[value] = ReservedString(value)
-                return r
+                for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
+                    for transition, pushes in self._first_plans[nonterminal].items():
+                        dfa_state.ilabel_to_plan[transition] = DFAPlan(next_dfa, pushes)
+                #print(dfa_state.from_rule, dfa_state.ilabel_to_plan)
 
     def _calculate_first_terminals(self, nonterminal):
         dfas = self._nonterminal_to_dfas[nonterminal]
@@ -105,35 +74,35 @@ class Grammar(object):
         state = dfas[0]
         totalset = set()
         overlapcheck = {}
-        for nonterminal_or_string, next_ in state.arcs.items():
-            if nonterminal_or_string in self._nonterminal_to_dfas:
-                # It's a nonterminal and we have either a left recursion issue
-                # in the grammar or we have to recurse.
-                try:
-                    fset = self._first_terminals[nonterminal_or_string]
-                except KeyError:
-                    self._calculate_first_terminals(nonterminal_or_string)
-                    fset = self._first_terminals[nonterminal_or_string]
-                else:
-                    if fset is None:
-                        raise ValueError("left recursion for rule %r" % nonterminal)
-                totalset.update(fset)
-                overlapcheck[nonterminal_or_string] = fset
+        for transition, next_ in state.ilabel_to_plan.items():
+            # It's a string. We have finally found a possible first token.
+            totalset.add(transition)
+            #overlapcheck[nonterminal] = set([transition])
+            first_plans[transition] = [next_.next_dfa]
 
-                for t, pushes in self._first_plans[nonterminal_or_string].items():
-                    check = first_plans.get(t)
-                    if check is not None:
-                        raise ValueError(
-                            "Rule %s is ambiguous; %s is the"
-                            " start of the rule %s as well as %s."
-                            % (nonterminal, t, nonterminal_or_string, check[-1].from_rule)
-                        )
-                    first_plans[t] = [next_] + pushes
+        for nonterminal2, next_ in state.nonterminal_arcs.items():
+            # It's a nonterminal and we have either a left recursion issue
+            # in the grammar or we have to recurse.
+            try:
+                fset = self._first_terminals[nonterminal2]
+            except KeyError:
+                self._calculate_first_terminals(nonterminal2)
+                fset = self._first_terminals[nonterminal2]
             else:
-                # It's a string. We have finally found a possible first token.
-                totalset.add(nonterminal_or_string)
-                overlapcheck[nonterminal_or_string] = set([nonterminal_or_string])
-                first_plans[nonterminal_or_string] = [next_]
+                if fset is None:
+                    raise ValueError("left recursion for rule %r" % nonterminal)
+            totalset.update(fset)
+            overlapcheck[nonterminal2] = fset
+
+            for t, pushes in self._first_plans[nonterminal2].items():
+                check = first_plans.get(t)
+                if check is not None:
+                    raise ValueError(
+                        "Rule %s is ambiguous; %s is the"
+                        " start of the rule %s as well as %s."
+                        % (nonterminal, t, nonterminal2, check[-1].from_rule)
+                    )
+                first_plans[t] = [next_] + pushes
 
         inverse = {}
         for nonterminal_or_string, first_set in overlapcheck.items():
diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py
index e194c3f..3dd9f2a 100644
--- a/parso/pgen2/pgen.py
+++ b/parso/pgen2/pgen.py
@@ -18,7 +18,9 @@ Specifying grammars in pgen is possible with this grammar::
 This grammar is self-referencing.
 """
 
-from parso.pgen2.grammar import Grammar
+from ast import literal_eval
+
+from parso.pgen2.grammar import Grammar, DFAPlan
 from parso.pgen2.grammar_parser import GrammarParser, NFAState
 
 
@@ -32,6 +34,7 @@ class DFAState(object):
         self.is_final = final in nfa_set
         self.arcs = {}  # map from terminals/nonterminals to DFAState
         self.ilabel_to_plan = {}
+        self.nonterminal_arcs = {}
 
     def add_arc(self, next_, label):
         assert isinstance(label, str)
@@ -66,6 +69,14 @@ class DFAState(object):
         )
 
 
+class ReservedString(object):
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return '%s(%s)' % (self.__class__.__name__, self.value)
+
+
 def _simplify_dfas(dfas):
     # This is not theoretically optimal, but works well enough.
     # Algorithm: repeatedly look for two states that have the same
@@ -184,4 +195,35 @@ def generate_grammar(bnf_grammar, token_namespace):
         if start_nonterminal is None:
             start_nonterminal = nfa_a.from_rule
 
-    return Grammar(bnf_grammar, start_nonterminal, rule_to_dfas, token_namespace)
+    reserved_strings = {}
+    for nonterminal, dfas in rule_to_dfas.items():
+        for dfa_state in dfas:
+            for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
+                if terminal_or_nonterminal in rule_to_dfas:
+                    dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
+                else:
+                    transition = _make_transition(
+                        token_namespace,
+                        reserved_strings,
+                        terminal_or_nonterminal
+                    )
+                    dfa_state.ilabel_to_plan[transition] = DFAPlan(next_dfa)
+
+    return Grammar(bnf_grammar, start_nonterminal, rule_to_dfas, reserved_strings)
+
+
+def _make_transition(token_namespace, reserved_syntax_strings, label):
+    if label[0].isalpha():
+        # A named token (e.g. NAME, NUMBER, STRING)
+        return getattr(token_namespace, label)
+    else:
+        # Either a keyword or an operator
+        assert label[0] in ('"', "'"), label
+        assert not label.startswith('"""') and not label.startswith("'''")
+        # TODO use literal_eval instead of a simple eval.
+        value = literal_eval(label)
+        try:
+            return reserved_syntax_strings[value]
+        except KeyError:
+            r = reserved_syntax_strings[value] = ReservedString(value)
+            return r
diff --git a/test/test_pgen2.py b/test/test_pgen2.py
index 88f6591..4dba172 100644
--- a/test/test_pgen2.py
+++ b/test/test_pgen2.py
@@ -282,3 +282,6 @@ def test_left_recursion():
 def test_ambiguities():
     with pytest.raises(ValueError, match='ambiguous'):
         generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes)
+
+    with pytest.raises(ValueError, match='ambiguous'):
+        generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes)