diff --git a/parso/pgen2/grammar.py b/parso/pgen2/grammar.py index 00a6e8c..86ed4d9 100644 --- a/parso/pgen2/grammar.py +++ b/parso/pgen2/grammar.py @@ -16,6 +16,8 @@ fallback token code OP, but the parser needs the actual token code. """ +from parso.python import token + class Grammar(object): """Pgen parsing tables conversion class. @@ -67,7 +69,10 @@ class Grammar(object): """ - def __init__(self, bnf_text, start_nonterminal): + def __init__(self, bnf_grammar, start_nonterminal, rule_to_dfas, token_namespace): + self._token_namespace = token_namespace + self._nonterminal_to_dfas = rule_to_dfas + self.nonterminal2number = {} self.number2nonterminal = {} self.states = [] @@ -79,6 +84,130 @@ class Grammar(object): self.label2nonterminal = {} self.start_nonterminal = start_nonterminal + self._make_grammar() + + def _make_grammar(self): + # Map from grammar rule (nonterminal) name to a set of tokens. + self._first_terminals = {} + + nonterminals = list(self._nonterminal_to_dfas.keys()) + nonterminals.sort() + for nonterminal in nonterminals: + if nonterminal not in self._first_terminals: + self._calculate_first_terminals(nonterminal) + + i = 256 + len(self.nonterminal2number) + self.nonterminal2number[nonterminal] = i + self.number2nonterminal[i] = nonterminal + + # Now that we have calculated the first terminals, we are sure that + # there is no left recursion or ambiguities. + + for nonterminal in nonterminals: + dfas = self._nonterminal_to_dfas[nonterminal] + states = [] + for state in dfas: + arcs = [] + for label, next_ in state.arcs.items(): + arcs.append((self._make_label(label), dfas.index(next_))) + if state.isfinal: + arcs.append((0, dfas.index(state))) + states.append(arcs) + self.states.append(states) + self.dfas[self.nonterminal2number[nonterminal]] = (states, self._make_first(nonterminal)) + + def _make_first(self, nonterminal): + rawfirst = self._first_terminals[nonterminal] + first = set() + for label in rawfirst: + ilabel = self._make_label(label) + ##assert ilabel not in first, "%s failed on <> ... !=" % label + first.add(ilabel) + return first + + def _make_label(self, label): + # XXX Maybe this should be a method on a subclass of converter? + ilabel = len(self.labels) + if label[0].isalpha(): + # Either a nonterminal name or a named token + if label in self.nonterminal2number: + # A nonterminal name + if label in self.nonterminal2label: + return self.nonterminal2label[label] + else: + self.labels.append((self.nonterminal2number[label], None)) + self.nonterminal2label[label] = ilabel + self.label2nonterminal[ilabel] = label + return ilabel + else: + # A named token (NAME, NUMBER, STRING) + itoken = getattr(self._token_namespace, label, None) + assert isinstance(itoken, int), label + if itoken in self.tokens: + return self.tokens[itoken] + else: + self.labels.append((itoken, None)) + self.tokens[itoken] = ilabel + return ilabel + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + value = eval(label) + if value[0].isalpha(): + # A keyword + if value in self.keywords: + return self.keywords[value] + else: + self.labels.append((token.NAME, value)) + self.keywords[value] = ilabel + return ilabel + else: + # An operator (any non-numeric token) + itoken = self._token_namespace.generate_token_id(value) + if itoken in self.tokens: + return self.tokens[itoken] + else: + self.labels.append((itoken, None)) + self.tokens[itoken] = ilabel + return ilabel + + def _calculate_first_terminals(self, nonterminal): + dfas = self._nonterminal_to_dfas[nonterminal] + self._first_terminals[nonterminal] = None # dummy to detect left recursion + # We only need to check the first dfa. All the following ones are not + # interesting to find first terminals. + state = dfas[0] + totalset = set() + overlapcheck = {} + for nonterminal_or_string, next_ in state.arcs.items(): + if nonterminal_or_string in self._nonterminal_to_dfas: + # It's a nonterminal and we have either a left recursion issue + # in the grammare or we have to recurse. + try: + fset = self._first_terminals[nonterminal_or_string] + except KeyError: + self._calculate_first_terminals(nonterminal_or_string) + fset = self._first_terminals[nonterminal_or_string] + else: + if fset is None: + raise ValueError("left recursion for rule %r" % nonterminal) + totalset.update(fset) + overlapcheck[nonterminal_or_string] = fset + else: + # It's a string. We have finally found a possible first token. + totalset.add(nonterminal_or_string) + overlapcheck[nonterminal_or_string] = set([nonterminal_or_string]) + + inverse = {} + for nonterminal_or_string, first_set in overlapcheck.items(): + for terminal in first_set: + if terminal in inverse: + raise ValueError("rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" % + (nonterminal, terminal, nonterminal_or_string, inverse[terminal])) + inverse[terminal] = nonterminal_or_string + self._first_terminals[nonterminal] = totalset + @property def start(self): return self.nonterminal2number[self.start_nonterminal] diff --git a/parso/pgen2/pgen.py b/parso/pgen2/pgen.py index 4480663..4453577 100644 --- a/parso/pgen2/pgen.py +++ b/parso/pgen2/pgen.py @@ -19,139 +19,9 @@ This grammar is self-referencing. """ from parso.pgen2.grammar import Grammar -from parso.python import token from parso.pgen2.grammar_parser import GrammarParser, NFAState -class ParserGenerator(object): - def __init__(self, rule_to_dfas, token_namespace): - self._token_namespace = token_namespace - self._nonterminal_to_dfas = rule_to_dfas - - def make_grammar(self, grammar): - # Map from grammar rule (nonterminal) name to a set of tokens. - self._first_terminals = {} - - nonterminals = list(self._nonterminal_to_dfas.keys()) - nonterminals.sort() - for nonterminal in nonterminals: - if nonterminal not in self._first_terminals: - self._calculate_first_terminals(nonterminal) - - i = 256 + len(grammar.nonterminal2number) - grammar.nonterminal2number[nonterminal] = i - grammar.number2nonterminal[i] = nonterminal - - # Now that we have calculated the first terminals, we are sure that - # there is no left recursion or ambiguities. - - for nonterminal in nonterminals: - dfas = self._nonterminal_to_dfas[nonterminal] - states = [] - for state in dfas: - arcs = [] - for label, next_ in state.arcs.items(): - arcs.append((self._make_label(grammar, label), dfas.index(next_))) - if state.isfinal: - arcs.append((0, dfas.index(state))) - states.append(arcs) - grammar.states.append(states) - grammar.dfas[grammar.nonterminal2number[nonterminal]] = (states, self._make_first(grammar, nonterminal)) - return grammar - - def _make_first(self, grammar, nonterminal): - rawfirst = self._first_terminals[nonterminal] - first = set() - for label in rawfirst: - ilabel = self._make_label(grammar, label) - ##assert ilabel not in first, "%s failed on <> ... !=" % label - first.add(ilabel) - return first - - def _make_label(self, grammar, label): - # XXX Maybe this should be a method on a subclass of converter? - ilabel = len(grammar.labels) - if label[0].isalpha(): - # Either a nonterminal name or a named token - if label in grammar.nonterminal2number: - # A nonterminal name - if label in grammar.nonterminal2label: - return grammar.nonterminal2label[label] - else: - grammar.labels.append((grammar.nonterminal2number[label], None)) - grammar.nonterminal2label[label] = ilabel - grammar.label2nonterminal[ilabel] = label - return ilabel - else: - # A named token (NAME, NUMBER, STRING) - itoken = getattr(self._token_namespace, label, None) - assert isinstance(itoken, int), label - if itoken in grammar.tokens: - return grammar.tokens[itoken] - else: - grammar.labels.append((itoken, None)) - grammar.tokens[itoken] = ilabel - return ilabel - else: - # Either a keyword or an operator - assert label[0] in ('"', "'"), label - value = eval(label) - if value[0].isalpha(): - # A keyword - if value in grammar.keywords: - return grammar.keywords[value] - else: - grammar.labels.append((token.NAME, value)) - grammar.keywords[value] = ilabel - return ilabel - else: - # An operator (any non-numeric token) - itoken = self._token_namespace.generate_token_id(value) - if itoken in grammar.tokens: - return grammar.tokens[itoken] - else: - grammar.labels.append((itoken, None)) - grammar.tokens[itoken] = ilabel - return ilabel - - def _calculate_first_terminals(self, nonterminal): - dfas = self._nonterminal_to_dfas[nonterminal] - self._first_terminals[nonterminal] = None # dummy to detect left recursion - # We only need to check the first dfa. All the following ones are not - # interesting to find first terminals. - state = dfas[0] - totalset = set() - overlapcheck = {} - for nonterminal_or_string, next_ in state.arcs.items(): - if nonterminal_or_string in self._nonterminal_to_dfas: - # It's a nonterminal and we have either a left recursion issue - # in the grammare or we have to recurse. - try: - fset = self._first_terminals[nonterminal_or_string] - except KeyError: - self._calculate_first_terminals(nonterminal_or_string) - fset = self._first_terminals[nonterminal_or_string] - else: - if fset is None: - raise ValueError("left recursion for rule %r" % nonterminal) - totalset.update(fset) - overlapcheck[nonterminal_or_string] = fset - else: - # It's a string. We have finally found a possible first token. - totalset.add(nonterminal_or_string) - overlapcheck[nonterminal_or_string] = set([nonterminal_or_string]) - - inverse = {} - for nonterminal_or_string, first_set in overlapcheck.items(): - for terminal in first_set: - if terminal in inverse: - raise ValueError("rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" % - (nonterminal, terminal, nonterminal_or_string, inverse[terminal])) - inverse[terminal] = nonterminal_or_string - self._first_terminals[nonterminal] = totalset - - class DFAState(object): def __init__(self, from_rule, nfa_set, final): assert isinstance(nfa_set, set) @@ -308,5 +178,4 @@ def generate_grammar(bnf_grammar, token_namespace): if start_nonterminal is None: start_nonterminal = nfa_a.from_rule - p = ParserGenerator(rule_to_dfas, token_namespace) - return p.make_grammar(Grammar(bnf_grammar, start_nonterminal)) + return Grammar(bnf_grammar, start_nonterminal, rule_to_dfas, token_namespace)