Mirror of https://github.com/davidhalter/parso.git, synced 2025-12-08 13:45:01 +08:00
Fix a few things so that the tokenizer can at least parse the grammar.

@@ -28,6 +28,7 @@ class ParserGenerator(object):
         c = grammar.Grammar(self._bnf_text)
         names = list(self.dfas.keys())
         names.sort()
+        # TODO do we still need this?
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
@@ -316,8 +317,8 @@ class ParserGenerator(object):
 
     def _expect(self, type):
         if self.type != type:
-            self._raise_error("expected %s, got %s(%s)",
-                              type, self.type, self.value)
+            self._raise_error("expected %s(%s), got %s(%s)",
+                              type, token.tok_name[type], self.type, self.value)
         value = self.value
         self._gettoken()
         return value
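
The only functional change in this hunk is the error message of ParserGenerator._expect(): the expected token type is now printed with its symbolic name next to the numeric value. A minimal, self-contained sketch of the resulting formatting (the tok_name table below is a made-up stand-in for illustration, not the real mapping from the token module):

    # Hypothetical token-type table; the real code looks up token.tok_name.
    tok_name = {1: 'NAME', 54: 'OP'}

    def format_expect_error(expected, got_type, got_value):
        # Mirrors the new "expected %s(%s), got %s(%s)" format string.
        return "expected %s(%s), got %s(%s)" % (
            expected, tok_name[expected], got_type, got_value)

    print(format_expect_error(1, 54, ':'))  # expected 1(NAME), got 54(:)
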
@@ -38,6 +38,8 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
+EXCLAMATION = next(_counter)
+tok_name[EXCLAMATION] = 'EXCLAMATION'
 
 # Map from operator to number (since tokenize doesn't do this)
 
@@ -90,6 +92,7 @@ opmap_raw = """\
 //= DOUBLESLASHEQUAL
 -> RARROW
 ... ELLIPSIS
+! EXCLAMATION
 """
 
 opmap = {}
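
The two hunks above register a new EXCLAMATION token type and add "! EXCLAMATION" to the whitespace-separated opmap_raw table, so a bare "!" can be tokenized as an operator (presumably for the f-string conversion syntax such as !r). As a rough illustration of how such a table becomes the opmap dict (a sketch only, not parso's actual loop, and mapping to names instead of numeric types):

    # Abbreviated stand-in for the opmap_raw table shown in the diff.
    opmap_raw = """\
    -> RARROW
    ... ELLIPSIS
    ! EXCLAMATION
    """

    opmap = {}
    for line in opmap_raw.splitlines():
        if line.strip():
            op, name = line.split()
            opmap[op] = name

    assert opmap['!'] == 'EXCLAMATION'
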
@@ -28,7 +28,7 @@ from parso.utils import split_lines
 
 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
 )
 
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -53,32 +53,35 @@ def group(*choices, **kwargs):
     return start + '|'.join(choices) + ')'
 
 
-def any(*choices):
-    return group(*choices) + '*'
-
-
 def maybe(*choices):
     return group(*choices) + '?'
 
 
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info, include_fstring=False):
+def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
     # The valid string prefixes. Only contain the lower case versions,
     # and don't contain any permuations (include 'fr', but not
     # 'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u']
+    valid_string_prefixes = ['b', 'r', 'u']
     if version_info >= (3, 0):
-        _valid_string_prefixes.append('br')
+        valid_string_prefixes.append('br')
 
+    result = {''}
     if version_info >= (3, 6) and include_fstring:
-        _valid_string_prefixes += ['f', 'fr']
+        f = ['f', 'fr']
+        if only_fstring:
+            valid_string_prefixes = f
+            result = set()
+        else:
+            valid_string_prefixes += f
+    elif only_fstring:
+        return set()
 
     # if we add binary f-strings, add: ['fb', 'fbr']
-    result = set([''])
-    for prefix in _valid_string_prefixes:
+    for prefix in valid_string_prefixes:
         for t in _itertools.permutations(prefix):
             # create a list with upper and lower versions of each
             # character
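
The new only_fstring flag lets _all_string_prefixes() answer a narrower question: which prefixes start an f-string and only an f-string? In that mode the implicit empty prefix is dropped and only 'f'/'fr' (in every ordering and casing) survive. A stripped-down, standalone sketch of just that branch, not the full parso function:

    import itertools

    def fstring_only_prefixes():
        # Mirrors the only_fstring path: start from ['f', 'fr'] and an empty
        # result set, then add every ordering and casing of each prefix.
        result = set()
        for prefix in ['f', 'fr']:
            for t in itertools.permutations(prefix):
                for s in itertools.product(*[(c, c.upper()) for c in t]):
                    result.add(''.join(s))
        return result

    # -> {'f', 'F', 'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'}
    print(fstring_only_prefixes())
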
@@ -183,10 +186,11 @@ def _create_token_collection(version_info):
                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                     group('"', r'\\\r?\n'))
     pseudo_extra_pool = [Comment, Triple]
+    all_quotes = '"', "'", '"""', "'''"
     if fstring_prefixes:
-        pseudo_extra_pool.append(FStringStart)
+        pseudo_extra_pool.append(FStringStart + group(*all_quotes))
 
-    PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool)
+    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
     PseudoToken = group(Whitespace, capture=True) + \
         group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
 
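
Two small fixes in this hunk: FStringStart on its own (just the prefix) is not a complete pattern, so the opening-quote alternatives are appended to it, and pseudo_extra_pool has to be unpacked into group() instead of being passed as one list argument. The latter matters because group() joins its arguments with '|', and handing it a list raises a TypeError. A quick sketch with a simplified group() helper (the capture keyword of the real helper is omitted):

    def group(*choices):
        # Non-capturing alternation, like the tokenizer's helper.
        return '(?:' + '|'.join(choices) + ')'

    # e.g. a Comment pattern and a (much simplified) FStringStart pattern
    pool = [r'#[^\r\n]*', r'[fF]"']

    print(group(r'\\\r?\n|\Z', *pool))  # OK: every element is an alternative
    # group(r'\\\r?\n|\Z', pool)        # TypeError: join() expects strings
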
@@ -204,7 +208,7 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
-    fstring_endpats = {}
+    fstring_pattern_map = {}
     for t in possible_prefixes:
         for quote in '"', "'":
             single_quoted.add(t + quote)
@@ -213,15 +217,15 @@ def _create_token_collection(version_info):
             triple_quoted.add(t + quote)
 
     for t in fstring_prefixes:
-        for quote in '"', "'", '"""', "'''":
-            fstring_endpats[t + quote] = quote
+        for quote in all_quotes:
+            fstring_pattern_map[t + quote] = quote
 
     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        fstring_endpats, ALWAYS_BREAK_TOKENS
+        fstring_pattern_map, ALWAYS_BREAK_TOKENS
     )
 
 
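
fstring_pattern_map (the renamed fstring_endpats) maps every possible f-string opening, i.e. prefix plus opening quote, to the quote sequence that will terminate it. A small illustration of the shape of that mapping, using only two prefixes instead of the full permutation set produced by _all_string_prefixes():

    all_quotes = '"', "'", '"""', "'''"

    fstring_pattern_map = {}
    for prefix in ('f', 'F'):  # the real set also contains fr, rF, Rf, ...
        for quote in all_quotes:
            fstring_pattern_map[prefix + quote] = quote

    assert fstring_pattern_map['f"""'] == '"""'
    assert fstring_pattern_map["F'"] == "'"
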
@@ -324,7 +328,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -372,6 +376,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 continue
 
         while pos < max:
+            assert not fstring_stack
            if fstring_stack:
                string, pos = _find_fstring_string(fstring_stack, line, pos)
                if string:
@@ -474,8 +479,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     break
                 else:  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
-            elif token in fstring_endpats:
-                fstring_stack.append(FStringNode(fstring_endpats[token]))
+            elif token in fstring_pattern_map:  # The start of an fstring.
+                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):  # ordinary name
                 if token in always_break_tokens:
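
The final hunk is again mostly the rename, plus a clarifying comment: when the matched token is a key of fstring_pattern_map, the tokenizer is at the start of an f-string, so it pushes a node remembering the closing quote and emits FSTRING_START. A compact, hypothetical sketch of that dispatch (this FStringNode is a stand-in with only the field the example needs, not parso's class):

    class FStringNode:
        def __init__(self, quote):
            self.quote = quote  # the quote sequence that will end this f-string

    fstring_stack = []
    fstring_pattern_map = {'f"': '"', "f'''": "'''"}  # abbreviated

    token = 'f"'
    if token in fstring_pattern_map:  # the start of an fstring
        fstring_stack.append(FStringNode(fstring_pattern_map[token]))
        # ...then: yield PythonToken(FSTRING_START, token, spos, prefix)
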
|