mirror of https://github.com/davidhalter/parso.git, synced 2026-01-09 04:52:42 +08:00
A first implementation of the fstring tokenizer
@@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist
 strings: (STRING | fstring)+
 fstring: FSTRING_START fstring_content FSTRING_END
 fstring_content: (FSTRING_STRING | fstring_expr)*
-fstring_expr: '{' testlist [ FSTRING_CONVERSION ] [ fstring_format_spec ] '}'
+fstring_conversion: '!' NAME
+fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
 fstring_format_spec: ':' fstring_content
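As an illustration of the revised rules (not part of the commit), an f-string decomposes roughly as below; the exact split of the FSTRING_STRING pieces is an assumption about how the tokenizer is meant to behave.

# f"x{value!r:>10}"  under the grammar above (hand-worked sketch):
#
#   FSTRING_START        f"
#   fstring_content
#     FSTRING_STRING     x
#     fstring_expr       {value!r:>10}
#       testlist             value
#       fstring_conversion   !r
#       fstring_format_spec  :>10     (':' followed by fstring_content)
#   FSTRING_END          "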
@@ -38,8 +38,6 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
-FSTRING_CONVERSION = next(_counter)
-tok_name[FSTRING_CONVERSION] = 'FSTRING_CONVERSION'
 
 # Map from operator to number (since tokenize doesn't do this)
 
@@ -20,21 +20,15 @@ from codecs import BOM_UTF8
 
 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
-                                ERROR_DEDENT)
+                                ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
+                                FSTRING_END)
 from parso._compatibility import py_version
 from parso.utils import split_lines
 
 
-#fstring_start = /[f|fr|rf]["|"""|'|''']/
-#fstring_end = <same as the second part of the fstring start>
-fstring_expr_start = ''
-fstring_string = r'([^{}\n]+|\{\{|\}\})*'
-fstring_conversion = r'![sra]'
-
-
 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
 )
 
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -68,7 +62,7 @@ def maybe(*choices):
 
 
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info):
+def _all_string_prefixes(version_info, include_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
@@ -79,7 +73,7 @@ def _all_string_prefixes(version_info):
     if version_info >= (3, 0):
         _valid_string_prefixes.append('br')
 
-    if version_info >= (3, 6):
+    if version_info >= (3, 6) and include_fstring:
        _valid_string_prefixes += ['f', 'fr']
 
     # if we add binary f-strings, add: ['fb', 'fbr']
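A sketch of what the new flag changes (expected behaviour, not captured output):

# _all_string_prefixes(version_info)                        -> '', 'b', 'r', 'br', ...
#     'f'/'fr' are now left out, so the ordinary STRING patterns no longer
#     swallow f-string openers
# _all_string_prefixes(version_info, include_fstring=True)  -> additionally 'f', 'fr'
#     (plus case variants), used further down for StringPrefixWithF and FStringStart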
@@ -109,6 +103,10 @@ def _get_token_collection(version_info):
     return result
 
 
+fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+')
+fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+')
+
+
 def _create_token_collection(version_info):
     # Note: we use unicode matching for names ("\w") but ascii matching for
     # number literals.
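A standalone check of what the two patterns accept (this sketch recompiles them with plain re instead of parso's _compile helper):

import re

single = re.compile(r'(?:[^{\r\n]+|\{\{)+')
multi = re.compile(r'(?:[^{]+|\{\{)+')

# Literal f-string text stops at the first single '{'; doubled braces are consumed.
assert single.match('ab{{cd{x}').group(0) == 'ab{{cd'
# The single-line pattern also stops at a newline, the multi-line one does not.
assert single.match('ab\ncd{x').group(0) == 'ab'
assert multi.match('ab\ncd{x').group(0) == 'ab\ncd'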
@@ -148,6 +146,9 @@ def _create_token_collection(version_info):
     # StringPrefix can be the empty string (making it optional).
     possible_prefixes = _all_string_prefixes(version_info)
     StringPrefix = group(*possible_prefixes)
+    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
+    fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
+    FStringStart = group(*fstring_prefixes)
 
     # Tail end of ' string.
     Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -157,7 +158,7 @@ def _create_token_collection(version_info):
     Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
     # Tail end of """ string.
     Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-    Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')
 
     # Because of leftmost-then-longest match semantics, be sure to put the
     # longest operators first (e.g., if = came before ==, == would get
@@ -181,7 +182,11 @@ def _create_token_collection(version_info):
                     group("'", r'\\\r?\n'),
                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                     group('"', r'\\\r?\n'))
-    PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+    pseudo_extra_pool = [Comment, Triple]
+    if fstring_prefixes:
+        pseudo_extra_pool.append(FStringStart)
+
+    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
     PseudoToken = group(Whitespace, capture=True) + \
         group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
 
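Why FStringStart joins PseudoExtras: since the plain StringPrefix no longer contains 'f', an opener like f" has to be matched as its own pseudo token before the Name pattern can grab the 'f'. A toy reconstruction of that ordering (all names below are local to this sketch):

import re

fstring_start = r'(?:fr|rf|f)["\']'   # stand-in for the generated FStringStart group
name = r'\w+'                         # stand-in for the Name pattern

# PseudoExtras is placed before Name in the alternation, so the opener wins.
pseudo = re.compile('(%s)|(%s)' % (fstring_start, name))
assert pseudo.match('f"x{1}"').group(0) == 'f"'
assert pseudo.match('foo').group(0) == 'foo'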
@@ -199,18 +204,24 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
+    fstring_endpats = {}
     for t in possible_prefixes:
-        for p in (t + '"', t + "'"):
-            single_quoted.add(p)
-        for p in (t + '"""', t + "'''"):
-            triple_quoted.add(p)
+        for quote in '"', "'":
+            single_quoted.add(t + quote)
+        for quote in '"""', "'''":
+            triple_quoted.add(t + quote)
 
+    for t in fstring_prefixes:
+        for quote in '"', "'", '"""', "'''":
+            fstring_endpats[t + quote] = quote
+
     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        ALWAYS_BREAK_TOKENS
+        fstring_endpats, ALWAYS_BREAK_TOKENS
     )
 
 
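A standalone sketch of the mapping this builds, with a reduced, assumed prefix list (the real fstring_prefixes also contains the other case variants):

fstring_endpats = {}
for t in ('f', 'F', 'fr', 'rf'):        # assumed subset of fstring_prefixes
    for quote in '"', "'", '"""', "'''":
        fstring_endpats[t + quote] = quote

assert fstring_endpats['f"'] == '"'
assert fstring_endpats["fr'''"] == "'''"
# The keys are matched against whole tokens later (`elif token in fstring_endpats:`),
# and the value is the quote sequence that will terminate the f-string.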
@@ -233,6 +244,72 @@ class PythonToken(Token):
                 self._replace(type=self._get_type_name()))
 
 
+class FStringNode(object):
+    def __init__(self, quote):
+        self.quote = quote
+        self.parentheses_count = 0
+        self.previous_lines = ''
+        self.in_format_spec = False
+
+    def open_parentheses(self, character):
+        self.parentheses_count += 1
+
+    def close_parentheses(self, character):
+        self.parentheses_count -= 1
+        return self.parentheses_count == 0
+
+    def allow_multiline(self):
+        return len(self.quote) == 3
+
+    def is_in_expr(self):
+        return self.parentheses_count and not self.in_format_spec
+
+
+def _check_fstring_ending(fstring_stack, token):
+    fstring_end = float('inf')
+    fstring_index = None
+    for i, node in enumerate(fstring_stack):
+        try:
+            end = token.index(node.quote)
+        except ValueError:
+            pass
+        else:
+            if fstring_index is None or end < fstring_end:
+                fstring_index = i
+                fstring_end = end
+    return fstring_index, fstring_end
+
+
+def _find_fstring_string(fstring_stack, line, pos):
+    tos = fstring_stack[-1]
+    if tos.is_in_expr():
+        return '', pos
+    else:
+        new_pos = pos
+        allow_multiline = tos.allow_multiline()
+        if allow_multiline:
+            match = fstring_string_multi_line.match(line, pos)
+        else:
+            match = fstring_string_single_line.match(line, pos)
+        if match is None:
+            string = tos.previous_lines
+        else:
+            string = match.group(0)
+            for fstring_stack_node in fstring_stack:
+                try:
+                    string = string[:string.index(fstring_stack_node.quote)]
+                except ValueError:
+                    pass  # The string was not found.
+
+        new_pos += len(string)
+        if allow_multiline and string.endswith('\n'):
+            tos.previous_lines += string
+            string = ''
+        else:
+            string = tos.previous_lines + string
+        return string, new_pos
+
+
 def tokenize(code, version_info, start_pos=(1, 0)):
     """Generate tokens from the source code (string)."""
     lines = split_lines(code, keepends=True)
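A standalone exercise of the stack search above (the helper is copied from the diff, the node class is reduced to the one attribute the search needs):

class _Node(object):                     # reduced stand-in for FStringNode
    def __init__(self, quote):
        self.quote = quote


def _check_fstring_ending(fstring_stack, token):
    fstring_end = float('inf')
    fstring_index = None
    for i, node in enumerate(fstring_stack):
        try:
            end = token.index(node.quote)
        except ValueError:
            pass
        else:
            if fstring_index is None or end < fstring_end:
                fstring_index = i
                fstring_end = end
    return fstring_index, fstring_end


# Two f-strings with different quotes are open; a '"' token contains the quote
# of the second node at offset 0, so that node is reported as ending first.
stack = [_Node("'"), _Node('"')]
assert _check_fstring_ending(stack, '"') == (1, 0)
# A token without any of the open quotes leaves everything untouched.
assert _check_fstring_ending(stack, 'abc') == (None, float('inf'))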
@@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     additional_prefix = ''
     first = True
     lnum = start_pos[0] - 1
+    fstring_stack = []
     for line in lines:  # loop over lines in stream
         lnum += 1
         pos = 0
@@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 continue
 
         while pos < max:
+            if fstring_stack:
+                string, pos = _find_fstring_string(fstring_stack, line, pos)
+                if string:
+                    fstring_stack[-1].previous_lines = ''
+                    yield PythonToken(FSTRING_STRING, string, (lnum, pos), '')
+                    continue
+
             pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:  # scan for tokens
                 txt = line[pos:]
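What the new branch is meant to consume, sketched with the single-line pattern directly (the starting position 2 and the variable names are assumptions of this sketch, not values from a real run):

import re

fstring_string_single_line = re.compile(r'(?:[^{\r\n]+|\{\{)+')

line = 'f"hello {name}!"\n'
pos = 2                                  # just past an assumed FSTRING_START  f"
match = fstring_string_single_line.match(line, pos)
assert match.group(0) == 'hello '        # would be yielded as FSTRING_STRING
pos = match.end()
assert line[pos] == '{'                  # the '{' falls through to pseudo_token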
@@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
 
             if new_line and initial not in '\r\n#':
                 new_line = False
-                if paren_level == 0:
+                if paren_level == 0 and not fstring_stack:
                     i = 0
                     while line[i] == '\f':
                         i += 1
+                        # TODO don't we need to change spos as well?
                         start -= 1
                     if start > indents[-1]:
                         yield PythonToken(INDENT, '', spos, '')
@@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                         yield PythonToken(DEDENT, '', spos, '')
                         indents.pop()
 
+            if fstring_stack:
+                fstring_index, end = _check_fstring_ending(fstring_stack, token)
+                if fstring_index is not None:
+                    if end != 0:
+                        yield PythonToken(ERRORTOKEN, token[:end], spos, prefix)
+
+                    yield PythonToken(
+                        FSTRING_END,
+                        fstring_stack[fstring_index].quote,
+                        (lnum, spos[1] + 1),
+                        prefix=''
+                    )
+                    del fstring_stack[fstring_index:]
+                    pos -= len(token) - end
+                    continue
+
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif initial in '\r\n':
-                if not new_line and paren_level == 0:
+                if any(not f.allow_multiline() for f in fstring_stack):
+                    fstring_stack.clear()
+
+                if not new_line and paren_level == 0 and not fstring_stack:
                     yield PythonToken(NEWLINE, token, spos, prefix)
                 else:
                     additional_prefix = prefix + token
@@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                         break
                 else:  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
+            elif token in fstring_endpats:
+                fstring_stack.append(FStringNode(fstring_endpats[token]))
+                yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):  # ordinary name
                 if token in always_break_tokens:
+                    fstring_stack.clear()
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 break
             else:
                 if token in '([{':
-                    paren_level += 1
+                    if fstring_stack:
+                        fstring_stack[-1].open_parentheses(token)
+                    else:
+                        paren_level += 1
                 elif token in ')]}':
-                    paren_level -= 1
+                    if fstring_stack:
+                        if fstring_stack[-1].close_parentheses(token):
+                            fstring_stack.pop()
+                    else:
+                        paren_level -= 1
+                elif token == ':' and fstring_stack \
+                        and fstring_stack[-1].parentheses_count == 1:
+                    fstring_stack[-1].in_format_spec = True
 
                 try:
                     # This check is needed in any case to check if it's a valid
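A hand-worked trace of the bracket counting above for f"{x[1:2]:>10}" (intent only, not tokenizer output):

#   '{'  open_parentheses   -> parentheses_count == 1     (expression begins)
#   '['  open_parentheses   -> parentheses_count == 2
#   ':'  count != 1, so it stays the slice colon of x[1:2]
#   ']'  close_parentheses  -> parentheses_count == 1
#   ':'  count == 1          -> in_format_spec = True      (format spec begins)
#   '}'  close_parentheses  -> parentheses_count == 0, the FStringNode is popped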