A first implementation of the fstring tokenizer

Dave Halter
2018-03-30 20:50:49 +02:00
parent e05ce5ae31
commit d8d2e596a5
3 changed files with 144 additions and 26 deletions

View File

@@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content FSTRING_END
fstring_content: (FSTRING_STRING | fstring_expr)*
fstring_conversion: '!' NAME
fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content
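
As a quick orientation (a sketch inferred from the rules above, not output of
this commit): an f-string such as f"x{y!r:>4}" should decompose into

    FSTRING_START        f"
    FSTRING_STRING       x
    '{'                  {
    testlist             y
    fstring_conversion   !r
    fstring_format_spec  :>4
    '}'                  }
    FSTRING_END          "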

View File

@@ -38,8 +38,6 @@ FSTRING_END = next(_counter)
tok_name[FSTRING_END] = 'FSTRING_END'
FSTRING_STRING = next(_counter)
tok_name[FSTRING_STRING] = 'FSTRING_STRING'
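# The FSTRING_* types are allocated from the shared _counter above, so they
# simply extend the existing token numbering; tok_name maps each number back
# to its name, e.g. tok_name[FSTRING_START] == 'FSTRING_START'.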
# Map from operator to number (since tokenize doesn't do this)

View File

@@ -20,21 +20,15 @@ from codecs import BOM_UTF8
from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
FSTRING_END)
from parso._compatibility import py_version
from parso.utils import split_lines
TokenCollection = namedtuple(
'TokenCollection',
'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
)
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -68,7 +62,7 @@ def maybe(*choices):
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
def different_case_versions(prefix):
for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
yield ''.join(s)
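    # e.g. different_case_versions('fr') yields 'fr', 'fR', 'Fr' and 'FR'.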
@@ -79,7 +73,7 @@ def _all_string_prefixes(version_info):
if version_info >= (3, 0):
_valid_string_prefixes.append('br')
    if version_info >= (3, 6) and include_fstring:
        if only_fstring:
            # Assumed from the call site below: only_fstring restricts the
            # result to the f-string prefixes themselves.
            _valid_string_prefixes = ['f', 'fr']
        else:
            _valid_string_prefixes += ['f', 'fr']
        # if we add binary f-strings, add: ['fb', 'fbr']
@@ -109,6 +103,10 @@ def _get_token_collection(version_info):
return result
fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+')
fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+')
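# Behaviour sketch (assumed, not asserted anywhere in this commit): both
# patterns eat literal f-string text, treating '{{' as an escaped brace and
# stopping before a single '{'; the single-line variant also stops at a
# newline, e.g. fstring_string_single_line.match('ab{{cd{e}').group(0)
# gives 'ab{{cd'.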
def _create_token_collection(version_info):
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
@@ -148,6 +146,9 @@ def _create_token_collection(version_info):
# StringPrefix can be the empty string (making it optional).
possible_prefixes = _all_string_prefixes(version_info)
StringPrefix = group(*possible_prefixes)
StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
    # The start token has to include its quote (e.g. 'f"') so that it can be
    # looked up in fstring_endpats below; the quote alternatives here are an
    # assumption.
    FStringStart = group(*fstring_prefixes) + group('"""', "'''", '"', "'")
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -157,7 +158,7 @@ def _create_token_collection(version_info):
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
@@ -181,7 +182,11 @@ def _create_token_collection(version_info):
group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
pseudo_extra_pool = [Comment, Triple]
if fstring_prefixes:
pseudo_extra_pool.append(FStringStart)
    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
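    # Sketch of the two capture groups (assumed semantics): for the input
    # '  f"abc', group 1 captures the whitespace prefix '  ' and group 2
    # captures 'f"' via the FStringStart alternative inside PseudoExtras.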
@@ -199,18 +204,24 @@ def _create_token_collection(version_info):
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
fstring_endpats = {}
for t in possible_prefixes:
for quote in '"', "'":
single_quoted.add(t + quote)
for quote in '"""', "'''":
triple_quoted.add(t + quote)
for t in fstring_prefixes:
for quote in '"', "'", '"""', "'''":
fstring_endpats[t + quote] = quote
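    # Resulting shape (illustration; assumes the 'f' prefixes were generated):
    #   fstring_endpats['f"']   == '"'
    #   fstring_endpats["f'''"] == "'''"
    # i.e. every f-string opener maps to the quote sequence that must end it.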
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
'finally', 'while', 'with', 'return')
pseudo_token_compiled = _compile(PseudoToken)
return TokenCollection(
pseudo_token_compiled, single_quoted, triple_quoted, endpats,
fstring_endpats, ALWAYS_BREAK_TOKENS
)
@@ -233,6 +244,72 @@ class PythonToken(Token):
self._replace(type=self._get_type_name()))
class FStringNode(object):
def __init__(self, quote):
self.quote = quote
self.parentheses_count = 0
self.previous_lines = ''
self.in_format_spec = False
def open_parentheses(self, character):
self.parentheses_count += 1
def close_parentheses(self, character):
self.parentheses_count -= 1
return self.parentheses_count == 0
    def allow_multiline(self):
        # Only triple-quoted f-strings may span multiple lines.
        return len(self.quote) == 3
def is_in_expr(self):
return self.parentheses_count and not self.in_format_spec
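# Rough driving sequence (hypothetical; mirrors the tokenize_lines loop below):
#   node = FStringNode('"')      # pushed when an FSTRING_START like 'f"' is seen
#   node.open_parentheses('{')   # '{' switches into expression mode
#   node.is_in_expr()            # -> True: normal pseudo-token matching resumes
#   node.close_parentheses('}')  # -> True: the bracket count is back at zero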
def _check_fstring_ending(fstring_stack, token):
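    # Find the end quote of every open f-string inside the token and return
    # (stack_index, offset) of the one that terminates earliest.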
fstring_end = float('inf')
fstring_index = None
for i, node in enumerate(fstring_stack):
try:
end = token.index(node.quote)
except ValueError:
pass
else:
if fstring_index is None or end < fstring_end:
fstring_index = i
fstring_end = end
return fstring_index, fstring_end
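# Example (assumed stack state): with an outer f" and a nested f' still open,
#   _check_fstring_ending(stack, 'ab\'cd"') == (1, 2)
# because the inner single quote at offset 2 terminates before the double one.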
def _find_fstring_string(fstring_stack, line, pos):
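    # Consume the literal (non-expression) part of the innermost f-string
    # starting at pos and return (string, new_pos).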
tos = fstring_stack[-1]
if tos.is_in_expr():
return '', pos
else:
new_pos = pos
allow_multiline = tos.allow_multiline()
if allow_multiline:
match = fstring_string_multi_line.match(line, pos)
else:
match = fstring_string_single_line.match(line, pos)
if match is None:
            string = tos.previous_lines
else:
string = match.group(0)
for fstring_stack_node in fstring_stack:
try:
string = string[:string.index(fstring_stack_node.quote)]
except ValueError:
pass # The string was not found.
new_pos += len(string)
if allow_multiline and string.endswith('\n'):
                tos.previous_lines += string
string = ''
else:
                string = tos.previous_lines + string
return string, new_pos
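# Example (single-line case, assumed): with fstring_stack == [FStringNode('"')],
#   _find_fstring_string(fstring_stack, 'hi {x}" rest', 0) == ('hi ', 3)
# The literal stops at the '{'; a closing '"' occurring earlier would have
# been trimmed away by the loop above.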
def tokenize(code, version_info, start_pos=(1, 0)):
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
@@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
token. This idea comes from lib2to3. The prefix contains all information
that is irrelevant for the parser like newlines in parentheses or comments.
"""
pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
_get_token_collection(version_info)
paren_level = 0 # count parentheses
indents = [0]
@@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
additional_prefix = ''
first = True
lnum = start_pos[0] - 1
fstring_stack = []
for line in lines: # loop over lines in stream
lnum += 1
pos = 0
@@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
continue
while pos < max:
if fstring_stack:
string, pos = _find_fstring_string(fstring_stack, line, pos)
if string:
                    # Reset the multiline buffer of the innermost f-string.
                    fstring_stack[-1].previous_lines = ''
yield PythonToken(FSTRING_STRING, string, (lnum, pos), '')
continue
pseudomatch = pseudo_token.match(line, pos)
if not pseudomatch: # scan for tokens
txt = line[pos:]
@@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
if new_line and initial not in '\r\n#':
new_line = False
if paren_level == 0 and not fstring_stack:
i = 0
while line[i] == '\f':
i += 1
# TODO don't we need to change spos as well?
start -= 1
if start > indents[-1]:
yield PythonToken(INDENT, '', spos, '')
@@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
yield PythonToken(DEDENT, '', spos, '')
indents.pop()
if fstring_stack:
                fstring_index, end = _check_fstring_ending(fstring_stack, token)
                if fstring_index is not None:
                    quote = fstring_stack[fstring_index].quote
                    if end != 0:
                        yield PythonToken(ERRORTOKEN, token[:end], spos, prefix)

                    yield PythonToken(
                        FSTRING_END,
                        quote,
                        (lnum, spos[1] + end),
                        prefix=''
                    )
                    # The ended f-string and anything nested inside it is done.
                    del fstring_stack[fstring_index:]
                    # Resume scanning directly after the end quote.
                    pos -= len(token) - end - len(quote)
                    continue
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield PythonToken(NUMBER, token, spos, prefix)
elif initial in '\r\n':
if any(not f.allow_multiline() for f in fstring_stack):
fstring_stack.clear()
if not new_line and paren_level == 0 and not fstring_stack:
yield PythonToken(NEWLINE, token, spos, prefix)
else:
additional_prefix = prefix + token
@@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
break
else: # ordinary string
yield PythonToken(STRING, token, spos, prefix)
elif token in fstring_endpats:
fstring_stack.append(FStringNode(fstring_endpats[token]))
yield PythonToken(FSTRING_START, token, spos, prefix)
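                # e.g. the token 'f"' is a key of fstring_endpats, so an
                # FStringNode('"') is pushed and the following text is routed
                # through _find_fstring_string above.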
elif is_identifier(initial): # ordinary name
if token in always_break_tokens:
fstring_stack.clear()
paren_level = 0
while True:
indent = indents.pop()
@@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
break
else:
if token in '([{':
if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
else:
paren_level += 1
elif token in ')]}':
if fstring_stack:
if fstring_stack[-1].close_parentheses(token):
fstring_stack.pop()
else:
paren_level -= 1
elif token == ':' and fstring_stack \
and fstring_stack[-1].parentheses_count == 1:
fstring_stack[-1].in_format_spec = True
try:
# This check is needed in any case to check if it's a valid