A first implementation of the fstring tokenizer

Dave Halter
2018-03-30 20:50:49 +02:00
parent e05ce5ae31
commit d8d2e596a5
3 changed files with 144 additions and 26 deletions

parso/python/grammar36.txt

@@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist
 strings: (STRING | fstring)+
 fstring: FSTRING_START fstring_content FSTRING_END
 fstring_content: (FSTRING_STRING | fstring_expr)*
-fstring_expr: '{' testlist [ FSTRING_CONVERSION ] [ fstring_format_spec ] '}'
+fstring_conversion: '!' NAME
+fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
 fstring_format_spec: ':' fstring_content
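For orientation: with these rules, an f-string such as f"a{x!r:>10}b" decomposes roughly as sketched below. Note that the conversion is now parsed as plain '!' NAME instead of a dedicated FSTRING_CONVERSION token.

    f"a{x!r:>10}b"
        FSTRING_START    f"
        FSTRING_STRING   a
        fstring_expr     { x !r :>10 }   (testlist, fstring_conversion, fstring_format_spec)
        FSTRING_STRING   b
        FSTRING_END      "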

parso/python/token.py

@@ -38,8 +38,6 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
-FSTRING_CONVERSION = next(_counter)
-tok_name[FSTRING_CONVERSION] = 'FSTRING_CONVERSION'
 # Map from operator to number (since tokenize doesn't do this)

parso/python/tokenize.py

@@ -20,21 +20,15 @@ from codecs import BOM_UTF8
 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
-                                ERROR_DEDENT)
+                                ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
+                                FSTRING_END)
 from parso._compatibility import py_version
 from parso.utils import split_lines

-#fstring_start = /[f|fr|rf]["|"""|'|''']/
-#fstring_end = <same as the second part of the fstring start>
-fstring_expr_start = ''
-fstring_string = r'([^{}\n]+|\{\{|\}\})*'
-fstring_conversion = r'![sra]'

 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
 )

 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -68,7 +62,7 @@ def maybe(*choices):
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info):
+def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
@@ -79,7 +73,7 @@ def _all_string_prefixes(version_info):
     if version_info >= (3, 0):
         _valid_string_prefixes.append('br')
-    if version_info >= (3, 6):
+    if version_info >= (3, 6) and include_fstring:
         _valid_string_prefixes += ['f', 'fr']
         # if we add binary f-strings, add: ['fb', 'fbr']
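As a side note, different_case_versions expands each prefix into every upper/lower-case spelling. A standalone sketch of just that helper, copied from the hunk above with an explicit import:

    import itertools

    def different_case_versions(prefix):
        # Yield every upper/lower-case spelling of the prefix, e.g. 'fr' -> fr, fR, Fr, FR.
        for s in itertools.product(*[(c, c.upper()) for c in prefix]):
            yield ''.join(s)

    print(list(different_case_versions('fr')))  # ['fr', 'fR', 'Fr', 'FR']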
@@ -109,6 +103,10 @@ def _get_token_collection(version_info):
     return result


+fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+')
+fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+')
+
+
 def _create_token_collection(version_info):
     # Note: we use unicode matching for names ("\w") but ascii matching for
     # number literals.
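The two new patterns consume literal f-string text: any run of characters that is not an opening brace, with '{{' allowed as an escaped brace; the single-line variant additionally stops at a newline. A standalone sketch using re directly (parso's _compile is a thin re.compile wrapper):

    import re

    fstring_string_single_line = re.compile(r'(?:[^{\r\n]+|\{\{)+')

    m = fstring_string_single_line.match('ab }} {{ cd {expr}', 0)
    print(m.group(0))  # 'ab }} {{ cd ' -- the match stops at the single '{'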
@@ -148,6 +146,9 @@ def _create_token_collection(version_info):
     # StringPrefix can be the empty string (making it optional).
     possible_prefixes = _all_string_prefixes(version_info)
     StringPrefix = group(*possible_prefixes)
+    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
+    fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
+    FStringStart = group(*fstring_prefixes)

     # Tail end of ' string.
     Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -157,7 +158,7 @@ def _create_token_collection(version_info):
     Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
     # Tail end of """ string.
     Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-    Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

     # Because of leftmost-then-longest match semantics, be sure to put the
     # longest operators first (e.g., if = came before ==, == would get
@@ -181,7 +182,11 @@ def _create_token_collection(version_info):
group("'", r'\\\r?\n'), group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n')) group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) pseudo_extra_pool = [Comment, Triple]
if fstring_prefixes:
pseudo_extra_pool.append(FStringStart)
PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool)
PseudoToken = group(Whitespace, capture=True) + \ PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
@@ -199,18 +204,24 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
+    fstring_endpats = {}
     for t in possible_prefixes:
-        for p in (t + '"', t + "'"):
-            single_quoted.add(p)
-        for p in (t + '"""', t + "'''"):
-            triple_quoted.add(p)
+        for quote in '"', "'":
+            single_quoted.add(t + quote)
+        for quote in '"""', "'''":
+            triple_quoted.add(t + quote)
+
+    for t in fstring_prefixes:
+        for quote in '"', "'", '"""', "'''":
+            fstring_endpats[t + quote] = quote

     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        ALWAYS_BREAK_TOKENS
+        fstring_endpats, ALWAYS_BREAK_TOKENS
     )
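The new fstring_endpats dict maps every possible f-string opening to the quote sequence that will terminate it. A minimal sketch of the resulting mapping, assuming 'f' and 'F' are among the computed fstring_prefixes (the 'fr'/'rf' variants work the same way):

    fstring_endpats = {}
    for t in ('f', 'F'):  # assumption: a subset of the real fstring_prefixes
        for quote in '"', "'", '"""', "'''":
            fstring_endpats[t + quote] = quote

    print(fstring_endpats["f'''"])  # "'''" -- the tokenizer later searches for this quote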
@@ -233,6 +244,72 @@ class PythonToken(Token):
             self._replace(type=self._get_type_name()))


+class FStringNode(object):
+    def __init__(self, quote):
+        self.quote = quote
+        self.parentheses_count = 0
+        self.previous_lines = ''
+        self.in_format_spec = False
+
+    def open_parentheses(self, character):
+        self.parentheses_count += 1
+
+    def close_parentheses(self, character):
+        self.parentheses_count -= 1
+        return self.parentheses_count == 0
+
+    def allow_multiline(self):
+        return len(self.quote) == 3
+
+    def is_in_expr(self):
+        return self.parentheses_count and not self.in_format_spec
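The parentheses counter is what distinguishes a nested '{' inside an expression from the '}' that ends it. A usage sketch, assuming the FStringNode class above is in scope (think of tokenizing f'{ {1: 2}[1] }'):

    node = FStringNode("'")
    node.open_parentheses('{')          # the '{' that starts the expression
    node.open_parentheses('{')          # the '{' of the nested dict display
    print(node.is_in_expr())            # 2 (truthy) -- expression tokens are expected
    print(node.close_parentheses('}'))  # False -- still inside the expression
    print(node.close_parentheses('}'))  # True  -- back to literal f-string text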
+def _check_fstring_ending(fstring_stack, token):
+    fstring_end = float('inf')
+    fstring_index = None
+    for i, node in enumerate(fstring_stack):
+        try:
+            end = token.index(node.quote)
+        except ValueError:
+            pass
+        else:
+            if fstring_index is None or end < fstring_end:
+                fstring_index = i
+                fstring_end = end
+    return fstring_index, fstring_end
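Given the stack of currently open f-strings, this helper finds the earliest closing quote inside a token and reports which stack entry it terminates. A usage sketch, same in-scope assumption as above:

    stack = [FStringNode('"'), FStringNode("'")]
    print(_check_fstring_ending(stack, 'foo\'bar"'))
    # (1, 3): the "'" at index 3 closes stack[1] before the '"' at index 7 closes stack[0]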
+def _find_fstring_string(fstring_stack, line, pos):
+    tos = fstring_stack[-1]
+    if tos.is_in_expr():
+        return '', pos
+    else:
+        new_pos = pos
+        allow_multiline = tos.allow_multiline()
+        if allow_multiline:
+            match = fstring_string_multi_line.match(line, pos)
+        else:
+            match = fstring_string_single_line.match(line, pos)
+        if match is None:
+            string = tos.previous_lines
+        else:
+            string = match.group(0)
+            for fstring_stack_node in fstring_stack:
+                try:
+                    string = string[:string.index(fstring_stack_node.quote)]
+                except ValueError:
+                    pass  # The quote was not found in the string.
+            new_pos += len(string)
+            if allow_multiline and string.endswith('\n'):
+                tos.previous_lines += string
+                string = ''
+            else:
+                string = tos.previous_lines + string
+        return string, new_pos
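Outside of an expression this consumes literal text and advances the position; inside an expression it returns nothing, so the regular pseudo-token matching below takes over. A usage sketch (with the attribute fixes applied, same in-scope assumption):

    stack = [FStringNode("'")]
    print(_find_fstring_string(stack, "ab{x}cd'", 0))  # ('ab', 2) -- stops at the '{'

    stack[-1].open_parentheses('{')                    # now inside the {x} expression
    print(_find_fstring_string(stack, "ab{x}cd'", 3))  # ('', 3) -- 'x' is matched as a normal token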
 def tokenize(code, version_info, start_pos=(1, 0)):
     """Generate tokens from the source code (string)."""
     lines = split_lines(code, keepends=True)
@@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     additional_prefix = ''
     first = True
     lnum = start_pos[0] - 1
+    fstring_stack = []
     for line in lines:  # loop over lines in stream
         lnum += 1
         pos = 0
@@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 continue

         while pos < max:
+            if fstring_stack:
+                string, pos = _find_fstring_string(fstring_stack, line, pos)
+                if string:
+                    fstring_stack[-1].previous_lines = ''
+                    yield PythonToken(FSTRING_STRING, string, (lnum, pos), '')
+                    continue
+
             pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:  # scan for tokens
                 txt = line[pos:]
@@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if new_line and initial not in '\r\n#':
                 new_line = False
-                if paren_level == 0:
+                if paren_level == 0 and not fstring_stack:
                     i = 0
                     while line[i] == '\f':
                         i += 1
+                        # TODO don't we need to change spos as well?
                         start -= 1
                     if start > indents[-1]:
                         yield PythonToken(INDENT, '', spos, '')
@@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     yield PythonToken(DEDENT, '', spos, '')
                     indents.pop()

+            if fstring_stack:
+                fstring_index, end = _check_fstring_ending(fstring_stack, token)
+                if fstring_index is not None:
+                    if end != 0:
+                        yield PythonToken(ERRORTOKEN, token[:end], spos, prefix)
+
+                    yield PythonToken(
+                        FSTRING_END,
+                        fstring_stack[fstring_index].quote,
+                        (lnum, spos[1] + 1),
+                        prefix=''
+                    )
+                    del fstring_stack[fstring_index:]
+                    pos -= len(token) - end
+                    continue
+
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif initial in '\r\n':
-                if not new_line and paren_level == 0:
+                if any(not f.allow_multiline() for f in fstring_stack):
+                    fstring_stack.clear()
+
+                if not new_line and paren_level == 0 and not fstring_stack:
                     yield PythonToken(NEWLINE, token, spos, prefix)
                 else:
                     additional_prefix = prefix + token
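Taken together, the intended token stream for a small input looks roughly like this (a design sketch, not captured output of this first implementation):

    f'ab{x}cd'  ->  FSTRING_START  "f'"
                    FSTRING_STRING 'ab'
                    OP '{', NAME 'x', OP '}'
                    FSTRING_STRING 'cd'
                    FSTRING_END    "'"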
@@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                         break
                 else:                                  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
+            elif token in fstring_endpats:
+                fstring_stack.append(FStringNode(fstring_endpats[token]))
+                yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):               # ordinary name
                 if token in always_break_tokens:
+                    fstring_stack.clear()
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                             break
             else:
                 if token in '([{':
-                    paren_level += 1
+                    if fstring_stack:
+                        fstring_stack[-1].open_parentheses(token)
+                    else:
+                        paren_level += 1
                 elif token in ')]}':
-                    paren_level -= 1
+                    if fstring_stack:
+                        if fstring_stack[-1].close_parentheses(token):
+                            fstring_stack.pop()
+                    else:
+                        paren_level -= 1
+                elif token == ':' and fstring_stack \
+                        and fstring_stack[-1].parentheses_count == 1:
+                    fstring_stack[-1].in_format_spec = True

                 try:
                     # This check is needed in any case to check if it's a valid
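The ':' case only fires at parentheses_count == 1, i.e. for the outermost expression brace, so a ':' in a nested dict display or slice stays an ordinary operator, while f'{width:>10}' switches into format-spec mode and '>10' is emitted as FSTRING_STRING again. A sketch of the state change, same in-scope assumption as above:

    node = FStringNode("'")
    node.open_parentheses('{')       # f'{width
    if node.parentheses_count == 1:
        node.in_format_spec = True   # what the ':' branch above does
    print(node.is_in_expr())         # False -- _find_fstring_string now consumes the spec text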