Fix a few things so that the tokenizer can at least parse the grammar.

Dave Halter
2018-03-30 22:13:18 +02:00
parent d8d2e596a5
commit 235fda3fbb
3 changed files with 31 additions and 22 deletions

View File

@@ -28,6 +28,7 @@ class ParserGenerator(object):
         c = grammar.Grammar(self._bnf_text)
         names = list(self.dfas.keys())
         names.sort()
+        # TODO do we still need this?
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
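The reordering that the new TODO questions only moves the grammar's start symbol to the front of the otherwise sorted name list. A standalone sketch of that idiom, with made-up nonterminal names rather than parso's real ones:

    # Illustrative only: put the start symbol first, keep the rest sorted.
    names = sorted(['expr', 'file_input', 'stmt'])
    startsymbol = 'file_input'
    names.remove(startsymbol)
    names.insert(0, startsymbol)
    print(names)  # ['file_input', 'expr', 'stmt']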
@@ -316,8 +317,8 @@ class ParserGenerator(object):
     def _expect(self, type):
         if self.type != type:
-            self._raise_error("expected %s, got %s(%s)",
-                              type, self.type, self.value)
+            self._raise_error("expected %s(%s), got %s(%s)",
+                              type, token.tok_name[type], self.type, self.value)
         value = self.value
         self._gettoken()
         return value
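The reworked message adds the human-readable token name next to the expected numeric type. Roughly what the new format produces, shown here with the standard library's token module standing in for parso's own (an assumption made only to keep the example runnable):

    import token  # stdlib stand-in; parso ships its own tok_name mapping

    expected = token.NAME
    got_type, got_value = token.NUMBER, '42'
    print("expected %s(%s), got %s(%s)"
          % (expected, token.tok_name[expected], got_type, got_value))
    # -> expected 1(NAME), got 2(42)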

View File

@@ -38,6 +38,8 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
+EXCLAMATION = next(_counter)
+tok_name[EXCLAMATION] = 'EXCLAMATION'
 # Map from operator to number (since tokenize doesn't do this)
@@ -90,6 +92,7 @@ opmap_raw = """\
 //= DOUBLESLASHEQUAL
 -> RARROW
 ... ELLIPSIS
+! EXCLAMATION
 """
 opmap = {}
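The new EXCLAMATION operator is presumably there because f-string replacement fields can carry a conversion marker, and once f-strings go through the regular tokenizer the bare ! has to be a token of its own. The syntax it corresponds to (plain Python 3.6+, independent of parso):

    value = 'a"b'
    print(f"{value!r} and {value!s}")  # the ! selects a conversion: 'a"b' and a"b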

View File

@@ -28,7 +28,7 @@ from parso.utils import split_lines
 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
 )
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -53,32 +53,35 @@ def group(*choices, **kwargs):
     return start + '|'.join(choices) + ')'
-def any(*choices):
-    return group(*choices) + '*'
-
-
 def maybe(*choices):
     return group(*choices) + '?'
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info, include_fstring=False):
+def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
     # The valid string prefixes. Only contain the lower case versions,
     # and don't contain any permuations (include 'fr', but not
     # 'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u']
+    valid_string_prefixes = ['b', 'r', 'u']
     if version_info >= (3, 0):
-        _valid_string_prefixes.append('br')
+        valid_string_prefixes.append('br')
+    result = {''}
     if version_info >= (3, 6) and include_fstring:
-        _valid_string_prefixes += ['f', 'fr']
+        f = ['f', 'fr']
+        if only_fstring:
+            valid_string_prefixes = f
+            result = set()
+        else:
+            valid_string_prefixes += f
+    elif only_fstring:
+        return set()
     # if we add binary f-strings, add: ['fb', 'fbr']
-    result = set([''])
-    for prefix in _valid_string_prefixes:
+    for prefix in valid_string_prefixes:
         for t in _itertools.permutations(prefix):
             # create a list with upper and lower versions of each
             # character
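The added only_fstring flag appears to let a caller ask for just the f-string prefixes (so a separate FStringStart pattern can be built), while include_fstring alone keeps folding them into the normal string prefixes. A self-contained sketch of the case/permutation logic above, simplified and assuming Python 3.6+ semantics; this is not parso's actual function:

    import itertools

    def string_prefixes(include_fstring=False, only_fstring=False):
        # Simplified sketch of the logic in the hunk above.
        valid = ['b', 'r', 'u', 'br']        # assumes Python 3
        result = {''}
        if include_fstring:
            f = ['f', 'fr']
            if only_fstring:
                valid, result = f, set()
            else:
                valid += f
        elif only_fstring:
            return set()
        for prefix in valid:
            for t in itertools.permutations(prefix):
                # every upper/lower-case spelling of every permutation
                for s in itertools.product(*[(c, c.upper()) for c in t]):
                    result.add(''.join(s))
        return result

    print(sorted(string_prefixes(include_fstring=True, only_fstring=True)))
    # ['F', 'FR', 'Fr', 'RF', 'Rf', 'f', 'fR', 'fr', 'rF', 'rf']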
@@ -183,10 +186,11 @@ def _create_token_collection(version_info):
                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                     group('"', r'\\\r?\n'))
     pseudo_extra_pool = [Comment, Triple]
+    all_quotes = '"', "'", '"""', "'''"
     if fstring_prefixes:
-        pseudo_extra_pool.append(FStringStart)
+        pseudo_extra_pool.append(FStringStart + group(*all_quotes))
 
-    PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool)
+    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
     PseudoToken = group(Whitespace, capture=True) + \
         group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
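Folding the opening quote into the FStringStart alternative means the main pseudo-token regex should now consume an f-string opening (prefix plus quote) in a single match. A rough, standalone illustration of that kind of pattern; the prefix list here is a hypothetical subset, not parso's generated one:

    import re

    quotes = ['"""', "'''", '"', "'"]                 # try triple quotes first
    prefixes = ['f', 'F', 'fr', 'rf', 'fR', 'Rf']     # hypothetical subset
    fstring_start = re.compile(
        '(?:%s)(?:%s)' % ('|'.join(prefixes),
                          '|'.join(re.escape(q) for q in quotes)))

    for code in ['f"abc"', "rf'''x'''", 'F"""doc"""', 'b"no"']:
        print(code, bool(fstring_start.match(code)))
    # f"abc" True / rf'''x''' True / F"""doc""" True / b"no" False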
@@ -204,7 +208,7 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
-    fstring_endpats = {}
+    fstring_pattern_map = {}
     for t in possible_prefixes:
         for quote in '"', "'":
             single_quoted.add(t + quote)
@@ -213,15 +217,15 @@ def _create_token_collection(version_info):
             triple_quoted.add(t + quote)
     for t in fstring_prefixes:
-        for quote in '"', "'", '"""', "'''":
-            fstring_endpats[t + quote] = quote
+        for quote in all_quotes:
+            fstring_pattern_map[t + quote] = quote
     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        fstring_endpats, ALWAYS_BREAK_TOKENS
+        fstring_pattern_map, ALWAYS_BREAK_TOKENS
     )
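The rename from fstring_endpats to fstring_pattern_map matches what the dict actually holds: every possible opening (prefix plus quote) mapped to the quote sequence that will terminate it. A small sketch of that mapping, with a hypothetical prefix list:

    all_quotes = '"', "'", '"""', "'''"
    fstring_prefixes = ['f', 'F', 'rf']          # hypothetical subset
    fstring_pattern_map = {}
    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    print(fstring_pattern_map['f"""'])   # """  (what closes this f-string)
    print(fstring_pattern_map["rf'"])    # '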
@@ -324,7 +328,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -372,6 +376,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             continue
         while pos < max:
+            assert not fstring_stack
             if fstring_stack:
                 string, pos = _find_fstring_string(fstring_stack, line, pos)
                 if string:
@@ -474,8 +479,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     break
                 else:  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
-            elif token in fstring_endpats:
-                fstring_stack.append(FStringNode(fstring_endpats[token]))
+            elif token in fstring_pattern_map:  # The start of an fstring.
+                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):  # ordinary name
                 if token in always_break_tokens:
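On a match the tokenizer now emits FSTRING_START and pushes an FStringNode that remembers the expected closing quote, so later input can be scanned for the terminator. A minimal, standalone sketch of that bookkeeping; this FStringNode is a stand-in, not parso's class:

    class FStringNode:
        # Stand-in: only remembers how the f-string has to end.
        def __init__(self, quote):
            self.quote = quote

    fstring_pattern_map = {'f"': '"', "f'''": "'''"}
    fstring_stack = []

    tok = 'f"'
    if tok in fstring_pattern_map:                  # the start of an fstring
        fstring_stack.append(FStringNode(fstring_pattern_map[tok]))
        print('FSTRING_START', tok, '-> closes with', fstring_stack[-1].quote)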