Make pgen2's grammar ambiguity detection handle more cases

Under the old implementation,

```
outer: A [inner] B C
inner: B C [inner]
```

wouldn't get detected as the ambiguous grammar that it is, whereas

```
outer: A rest
rest: [inner] B C
inner: B C [inner]
```

would.

This would manifest itself as non-determinism in the DFA state
generation. See the discussion #62 on for a full explanation.

This modifies the ambiguity detection to work on a broader class of
issues, so it should now hopefully detect all cases where the given
grammar is ambiguous.

At some point, we could extend this logic to allow developers to
optionally set precedence of grammar productions, which could resolve
ambiguities, but that's not a strict requirement for parsing python.
This commit is contained in:
Benjamin Woodruff
2019-05-07 16:56:29 -04:00
committed by Dave Halter
parent c0ace63a69
commit 0032bae041
2 changed files with 60 additions and 12 deletions

View File

@@ -309,13 +309,39 @@ def _calculate_tree_traversal(nonterminal_to_dfas):
_calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal) _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)
# Now that we have calculated the first terminals, we are sure that # Now that we have calculated the first terminals, we are sure that
# there is no left recursion or ambiguities. # there is no left recursion.
for dfas in nonterminal_to_dfas.values(): for dfas in nonterminal_to_dfas.values():
for dfa_state in dfas: for dfa_state in dfas:
transitions = dfa_state.transitions
for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items(): for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
for transition, pushes in first_plans[nonterminal].items(): for transition, pushes in first_plans[nonterminal].items():
dfa_state.transitions[transition] = DFAPlan(next_dfa, pushes) if transition in transitions:
prev_plan = transitions[transition]
# Make sure these are sorted so that error messages are
# at least deterministic
choices = sorted([
(
prev_plan.dfa_pushes[0].from_rule
if prev_plan.dfa_pushes
else prev_plan.next_dfa.from_rule
),
(
pushes[0].from_rule
if pushes else next_dfa.from_rule
),
])
raise ValueError(
"Rule %s is ambiguous; given a %s token, we "
"can't determine if we should evaluate %s or %s."
% (
(
dfa_state.from_rule,
transition,
) + tuple(choices)
)
)
transitions[transition] = DFAPlan(next_dfa, pushes)
def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
@@ -345,13 +371,6 @@ def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
raise ValueError("left recursion for rule %r" % nonterminal) raise ValueError("left recursion for rule %r" % nonterminal)
for t, pushes in first_plans2.items(): for t, pushes in first_plans2.items():
check = new_first_plans.get(t)
if check is not None:
raise ValueError(
"Rule %s is ambiguous; %s is the"
" start of the rule %s as well as %s."
% (nonterminal, t, nonterminal2, check[-1].from_rule)
)
new_first_plans[t] = [next_] + pushes new_first_plans[t] = [next_] + pushes
first_plans[nonterminal] = new_first_plans first_plans[nonterminal] = new_first_plans

View File

@@ -293,11 +293,40 @@ def test_left_recursion():
def test_ambiguities(): def test_ambiguities():
with pytest.raises(ValueError, match='ambiguous'): with pytest.raises(
ValueError,
match=r"foo is ambiguous.*given a TokenType\(NAME\).*bar or baz"
):
generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes) generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes)
with pytest.raises(ValueError, match='ambiguous'): with pytest.raises(
ValueError,
match=r"foo is ambiguous.*given a ReservedString\(x\).*bar or baz"
):
generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes) generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes)
with pytest.raises(ValueError, match='ambiguous'): with pytest.raises(
ValueError,
match=r"foo is ambiguous.*given a ReservedString\(x\).*bar or foo"
):
generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes) generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes)
# an ambiguity with the second (not the first) child of a production
with pytest.raises(
ValueError,
match=r"outer is ambiguous.*given a ReservedString\(b\).*inner or outer"
):
generate_grammar(
'outer: "a" [inner] "b" "c"\ninner: "b" "c" [inner]\n',
tokenize.PythonTokenTypes,
)
# an ambiguity hidden by a level of indirection (middle)
with pytest.raises(
ValueError,
match=r"outer is ambiguous.*given a ReservedString\(b\).*middle or outer"
):
generate_grammar(
'outer: "a" [middle] "b" "c"\nmiddle: inner\ninner: "b" "c" [inner]\n',
tokenize.PythonTokenTypes,
)