Use token.OP and use reserved words

This change breaks the tokenizer's backwards compatibility a bit. The details
of individual operators are now handled by the parser instead of the tokenizer.
The parser has to do this work anyway, so we don't need the extra complexity in
the tokenizer.
Dave Halter
2018-06-24 11:28:23 +02:00
parent 33e321a539
commit b5378e4602
5 changed files with 19 additions and 28 deletions
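
In short, the tokenizer used to assign each operator its own token type, while
after this change it only reports a generic OP and leaves the interpretation of
the string to the parser. A rough sketch of what that means for the token
stream (simplified, not parso's actual API):

    # Rough sketch (simplified, not parso's actual API) of the token stream
    # for the source "foo: bar" before and after this change.

    # Before: the tokenizer itself distinguished individual operators.
    old_tokens = [('NAME', 'foo'), ('COLON', ':'), ('NAME', 'bar')]

    # After: every operator is reported as a generic OP; the parser decides
    # what ':' means by looking the string up in reserved_syntax_strings.
    new_tokens = [('NAME', 'foo'), ('OP', ':'), ('NAME', 'bar')]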

View File

@@ -59,7 +59,7 @@ class Grammar(object):
         self._nonterminal_to_dfas = rule_to_dfas
         self.labels = [(0, "EMPTY")]
-        self.keywords = {}
+        self.reserved_syntax_strings = {}
         self.tokens = {}
         self.start_nonterminal = start_nonterminal
@@ -104,7 +104,6 @@ class Grammar(object):
     #@_cache_labels
     def _make_label(self, label):
-        # XXX Maybe this should be a method on a subclass of converter?
         ilabel = len(self.labels)
         if label[0].isalpha():
             # Either a nonterminal name or a named token
@@ -124,23 +123,12 @@ class Grammar(object):
             assert label[0] in ('"', "'"), label
             # TODO use literal_eval instead of a simple eval.
             value = eval(label)
-            if value[0].isalpha():
-                # A keyword
-                if value in self.keywords:
-                    return self.keywords[value]
-                else:
-                    self.labels.append((token.NAME, value))
-                    self.keywords[value] = ilabel
-                    return ilabel
+            if value in self.reserved_syntax_strings:
+                return self.reserved_syntax_strings[value]
             else:
-                # An operator (any non-numeric token)
-                itoken = self._token_namespace.generate_token_id(value)
-                if itoken in self.tokens:
-                    return self.tokens[itoken]
-                else:
-                    self.labels.append((itoken, None))
-                    self.tokens[itoken] = ilabel
-                    return ilabel
+                self.labels.append((token.NAME, value))
+                self.reserved_syntax_strings[value] = ilabel
+                return self.reserved_syntax_strings[value]
 
     def _calculate_first_terminals(self, nonterminal):
         dfas = self._nonterminal_to_dfas[nonterminal]
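
The net effect of the new branch above is that every quoted grammar string,
whether it looks like a keyword ('if') or an operator (':'), ends up in a
single reserved_syntax_strings mapping from string to label index. A condensed,
self-contained sketch of that bookkeeping (hypothetical helper name, same logic
as the new code):

    # Condensed sketch: every quoted grammar string, keyword or operator,
    # gets exactly one entry in reserved_syntax_strings.
    NAME = 1  # stand-in for token.NAME

    labels = [(0, "EMPTY")]
    reserved_syntax_strings = {}

    def make_label_for_string(value):
        # Return the existing label index, or register a new one.
        if value in reserved_syntax_strings:
            return reserved_syntax_strings[value]
        ilabel = len(labels)
        labels.append((NAME, value))
        reserved_syntax_strings[value] = ilabel
        return ilabel

    assert make_label_for_string('if') == 1
    assert make_label_for_string(':') == 2
    assert make_label_for_string('if') == 1   # reuses the earlier label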

View File

@@ -30,7 +30,7 @@ class GrammarParser():
             # rule: NAME ':' rhs NEWLINE
             self._current_rule_name = self._expect(token.NAME)
-            self._expect(token.COLON)
+            self._expect(token.OP, ':')
             a, z = self._parse_rhs()
             self._expect(token.NEWLINE)
@@ -60,7 +60,7 @@ class GrammarParser():
     def _parse_items(self):
         # items: item+
         a, b = self._parse_item()
-        while self.type in (token.NAME, token.STRING, token.LPAR, token.LSQB):
+        while self.type in (token.NAME, token.STRING) or self.value in ('(', '['):
             c, d = self._parse_item()
             # Need to end on the next item.
             b.add_arc(c)
@@ -72,7 +72,7 @@ class GrammarParser():
if self.value == "[":
self._gettoken()
a, z = self._parse_rhs()
self._expect(token.RSQB)
self._expect(token.OP, ']')
# Make it also possible that there is no token and change the
# state.
a.add_arc(z)
@@ -97,7 +97,7 @@ class GrammarParser():
if self.value == "(":
self._gettoken()
a, z = self._parse_rhs()
self._expect(token.RPAR)
self._expect(token.OP, ')')
return a, z
elif self.type in (token.NAME, token.STRING):
a = NFAState(self._current_rule_name)
@@ -110,10 +110,12 @@ class GrammarParser():
self._raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def _expect(self, type):
def _expect(self, type, value=None):
if self.type != type:
self._raise_error("expected %s(%s), got %s(%s)",
type, token.tok_name[type], self.type, self.value)
if value is not None and self.value != value:
self._raise_error("expected %s, got %s", value, self.value)
value = self.value
self._gettoken()
return value
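
With the optional value argument, one helper covers both "expect any token of
this type" and "expect this exact string", which is what the call sites above
rely on. A self-contained illustration of the pattern (hypothetical MiniParser
and token constants, not parso's real classes):

    # Hypothetical illustration of the extended _expect pattern on a fake
    # token stream.
    NAME, OP, NEWLINE = 'NAME', 'OP', 'NEWLINE'

    class MiniParser:
        def __init__(self, tokens):
            self._tokens = iter(tokens)
            self._advance()

        def _advance(self):
            self.type, self.value = next(self._tokens, ('ENDMARKER', ''))

        def _expect(self, type, value=None):
            # Check the token type, and optionally the exact string value,
            # e.g. _expect(OP, ':') for the colon after a rule name.
            if self.type != type:
                raise SyntaxError('expected %s, got %s(%r)'
                                  % (type, self.type, self.value))
            if value is not None and self.value != value:
                raise SyntaxError('expected %r, got %r' % (value, self.value))
            value = self.value
            self._advance()
            return value

    p = MiniParser([(NAME, 'rule'), (OP, ':'), (NAME, 'x'), (NEWLINE, '\n')])
    print(p._expect(NAME))      # -> 'rule'
    print(p._expect(OP, ':'))   # -> ':'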

View File

@@ -71,10 +71,10 @@ def token_to_ilabel(grammar, type_, value):
     # Map from token to label
     # TODO this is not good, shouldn't use tokenize.NAME, but somehow use the
     # grammar.
-    if type_ == tokenize.NAME:
+    if type_ in (tokenize.NAME, tokenize.OP):
         # Check for reserved words (keywords)
         try:
-            return grammar.keywords[value]
+            return grammar.reserved_syntax_strings[value]
         except KeyError:
             pass
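
Because keywords and operators now share one table, the lookup only needs to
know that the token is a NAME or an OP; anything not found there falls back to
the generic per-type label. A simplified sketch of that control flow (fake
grammar object; the real fallback handling differs slightly):

    # Simplified control flow of the lookup (not the real grammar object).
    NAME, OP, NUMBER = 'NAME', 'OP', 'NUMBER'

    class FakeGrammar:
        reserved_syntax_strings = {'if': 10, ':': 11}
        tokens = {NAME: 1, OP: 2, NUMBER: 3}   # generic per-type labels

    def token_to_ilabel(grammar, type_, value):
        if type_ in (NAME, OP):
            # Reserved strings (keywords *and* operators) win over the
            # generic token label.
            try:
                return grammar.reserved_syntax_strings[value]
            except KeyError:
                pass
        return grammar.tokens.get(type_)

    g = FakeGrammar()
    print(token_to_ilabel(g, OP, ':'))      # -> 11 (reserved operator)
    print(token_to_ilabel(g, NAME, 'foo'))  # -> 1  (plain name)
    print(token_to_ilabel(g, NUMBER, '3'))  # -> 3  (generic token label)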

View File

@@ -127,7 +127,7 @@ class Parser(BaseParser):
     def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
         # print('leaf', repr(value), token.tok_name[type])
         if type == NAME:
-            if value in pgen_grammar.keywords:
+            if value in pgen_grammar.reserved_syntax_strings:
                 return tree.Keyword(value, start_pos, prefix)
             else:
                 return tree.Name(value, start_pos, prefix)
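
On the tree-building side, the same table decides whether a NAME leaf becomes a
keyword node or a plain name node. Roughly, with hypothetical stand-ins for
parso's tree classes:

    # Rough stand-in for the leaf conversion (hypothetical node classes,
    # not parso's real tree module).
    class Name:
        def __init__(self, value):
            self.value = value

    class Keyword(Name):
        pass

    reserved_syntax_strings = {'if': 10, 'for': 11, ':': 12}

    def convert_name_leaf(value):
        # A NAME token whose string is reserved becomes a Keyword node.
        if value in reserved_syntax_strings:
            return Keyword(value)
        return Name(value)

    print(type(convert_name_leaf('if')).__name__)   # -> Keyword
    print(type(convert_name_leaf('foo')).__name__)  # -> Name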

View File

@@ -21,7 +21,7 @@ from codecs import BOM_UTF8
 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
                                 ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
-                                FSTRING_END)
+                                FSTRING_END, OP)
 from parso._compatibility import py_version
 from parso.utils import split_lines
@@ -574,7 +574,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 try:
                     # This check is needed in any case to check if it's a valid
                     # operator or just some random unicode character.
-                    typ = opmap[token]
+                    opmap[token]
+                    typ = OP
                 except KeyError:
                     typ = ERRORTOKEN
                 yield PythonToken(typ, token, spos, prefix)
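
The tokenizer still consults opmap, but only to verify that the matched string
really is an operator; the type it reports is always the generic OP, or
ERRORTOKEN when the lookup fails. In isolation the pattern looks roughly like
this (hypothetical opmap excerpt):

    # Isolated sketch of the new classification step (hypothetical opmap
    # excerpt; the real table is generated from the grammar).
    OP, ERRORTOKEN = 'OP', 'ERRORTOKEN'
    opmap = {'+': 14, ':': 11, '**=': 46}

    def classify_operator(string):
        try:
            # Only used as a validity check now; the specific operator id
            # is no longer reported to the parser.
            opmap[string]
            return OP
        except KeyError:
            return ERRORTOKEN

    print(classify_operator('**='))  # -> OP
    print(classify_operator('$'))    # -> ERRORTOKEN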