From 0032bae041075a7439589b81ef4ba1b9133a7716 Mon Sep 17 00:00:00 2001 From: Benjamin Woodruff Date: Tue, 7 May 2019 16:56:29 -0400 Subject: [PATCH] Make pgen2's grammar ambiguity detection handle more cases Under the old implementation, ``` outer: A [inner] B C inner: B C [inner] ``` wouldn't get detected as the ambiguous grammar that it is, whereas ``` outer: A rest rest: [inner] B C inner: B C [inner] ``` would. This would manifest itself as non-determinism in the DFA state generation. See the discussion #62 on for a full explanation. This modifies the ambiguity detection to work on a broader class of issues, so it should now hopefully detect all cases where the given grammar is ambiguous. At some point, we could extend this logic to allow developers to optionally set precedence of grammar productions, which could resolve ambiguities, but that's not a strict requirement for parsing python. --- parso/pgen2/generator.py | 37 ++++++++++++++++++++++++++++--------- test/test_pgen2.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/parso/pgen2/generator.py b/parso/pgen2/generator.py index 184afb3..359a94f 100644 --- a/parso/pgen2/generator.py +++ b/parso/pgen2/generator.py @@ -309,13 +309,39 @@ def _calculate_tree_traversal(nonterminal_to_dfas): _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal) # Now that we have calculated the first terminals, we are sure that - # there is no left recursion or ambiguities. + # there is no left recursion. for dfas in nonterminal_to_dfas.values(): for dfa_state in dfas: + transitions = dfa_state.transitions for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items(): for transition, pushes in first_plans[nonterminal].items(): - dfa_state.transitions[transition] = DFAPlan(next_dfa, pushes) + if transition in transitions: + prev_plan = transitions[transition] + # Make sure these are sorted so that error messages are + # at least deterministic + choices = sorted([ + ( + prev_plan.dfa_pushes[0].from_rule + if prev_plan.dfa_pushes + else prev_plan.next_dfa.from_rule + ), + ( + pushes[0].from_rule + if pushes else next_dfa.from_rule + ), + ]) + raise ValueError( + "Rule %s is ambiguous; given a %s token, we " + "can't determine if we should evaluate %s or %s." + % ( + ( + dfa_state.from_rule, + transition, + ) + tuple(choices) + ) + ) + transitions[transition] = DFAPlan(next_dfa, pushes) def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): @@ -345,13 +371,6 @@ def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): raise ValueError("left recursion for rule %r" % nonterminal) for t, pushes in first_plans2.items(): - check = new_first_plans.get(t) - if check is not None: - raise ValueError( - "Rule %s is ambiguous; %s is the" - " start of the rule %s as well as %s." - % (nonterminal, t, nonterminal2, check[-1].from_rule) - ) new_first_plans[t] = [next_] + pushes first_plans[nonterminal] = new_first_plans diff --git a/test/test_pgen2.py b/test/test_pgen2.py index c83dff7..6ec081b 100644 --- a/test/test_pgen2.py +++ b/test/test_pgen2.py @@ -293,11 +293,40 @@ def test_left_recursion(): def test_ambiguities(): - with pytest.raises(ValueError, match='ambiguous'): + with pytest.raises( + ValueError, + match=r"foo is ambiguous.*given a TokenType\(NAME\).*bar or baz" + ): generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes) - with pytest.raises(ValueError, match='ambiguous'): + with pytest.raises( + ValueError, + match=r"foo is ambiguous.*given a ReservedString\(x\).*bar or baz" + ): generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes) - with pytest.raises(ValueError, match='ambiguous'): + with pytest.raises( + ValueError, + match=r"foo is ambiguous.*given a ReservedString\(x\).*bar or foo" + ): generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes) + + # an ambiguity with the second (not the first) child of a production + with pytest.raises( + ValueError, + match=r"outer is ambiguous.*given a ReservedString\(b\).*inner or outer" + ): + generate_grammar( + 'outer: "a" [inner] "b" "c"\ninner: "b" "c" [inner]\n', + tokenize.PythonTokenTypes, + ) + + # an ambiguity hidden by a level of indirection (middle) + with pytest.raises( + ValueError, + match=r"outer is ambiguous.*given a ReservedString\(b\).*middle or outer" + ): + generate_grammar( + 'outer: "a" [middle] "b" "c"\nmiddle: inner\ninner: "b" "c" [inner]\n', + tokenize.PythonTokenTypes, + )