A first implementation of the fstring tokenizer

Dave Halter
2018-03-30 20:50:49 +02:00
parent e05ce5ae31
commit d8d2e596a5
3 changed files with 144 additions and 26 deletions

parso/python/grammar36.txt

@@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist
 strings: (STRING | fstring)+
 fstring: FSTRING_START fstring_content FSTRING_END
 fstring_content: (FSTRING_STRING | fstring_expr)*
-fstring_expr: '{' testlist [ FSTRING_CONVERSION ] [ fstring_format_spec ] '}'
+fstring_conversion: '!' NAME
+fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
 fstring_format_spec: ':' fstring_content
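For orientation: with these rules, an f-string such as f"a{x!r:>10}b" decomposes roughly as sketched below. Note that the conversion is now parsed as plain '!' NAME instead of a dedicated FSTRING_CONVERSION token.

    f"a{x!r:>10}b"
        FSTRING_START    f"
        FSTRING_STRING   a
        fstring_expr     { x !r :>10 }   (testlist, fstring_conversion, fstring_format_spec)
        FSTRING_STRING   b
        FSTRING_END      "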

parso/python/token.py

@@ -38,8 +38,6 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
-FSTRING_CONVERSION = next(_counter)
-tok_name[FSTRING_CONVERSION] = 'FSTRING_CONVERSION'
 # Map from operator to number (since tokenize doesn't do this)

parso/python/tokenize.py

@@ -20,21 +20,15 @@ from codecs import BOM_UTF8
 from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                 NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
-                                ERROR_DEDENT)
+                                ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
+                                FSTRING_END)
 from parso._compatibility import py_version
 from parso.utils import split_lines

-#fstring_start = /[f|fr|rf]["|"""|'|''']/
-#fstring_end = <same as the second part of the fstring start>
-fstring_expr_start = ''
-fstring_string = r'([^{}\n]+|\{\{|\}\})*'
-fstring_conversion = r'![sra]'

 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
 )

 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -68,7 +62,7 @@ def maybe(*choices):
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info):
+def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
@@ -79,7 +73,7 @@ def _all_string_prefixes(version_info):
     if version_info >= (3, 0):
         _valid_string_prefixes.append('br')
-    if version_info >= (3, 6):
+    if version_info >= (3, 6) and include_fstring:
         _valid_string_prefixes += ['f', 'fr']
         # if we add binary f-strings, add: ['fb', 'fbr']
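As a side note, different_case_versions expands each prefix into every upper/lower-case spelling. A standalone sketch of just that helper, copied from the hunk above with an explicit import:

    import itertools

    def different_case_versions(prefix):
        # Yield every upper/lower-case spelling of the prefix, e.g. 'fr' -> fr, fR, Fr, FR.
        for s in itertools.product(*[(c, c.upper()) for c in prefix]):
            yield ''.join(s)

    print(list(different_case_versions('fr')))  # ['fr', 'fR', 'Fr', 'FR']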
@@ -109,6 +103,10 @@ def _get_token_collection(version_info):
     return result


+fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+')
+fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+')
+
+
 def _create_token_collection(version_info):
     # Note: we use unicode matching for names ("\w") but ascii matching for
     # number literals.
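The two new patterns consume literal f-string text: any run of characters that is not an opening brace, with '{{' allowed as an escaped brace; the single-line variant additionally stops at a newline. A standalone sketch using re directly (parso's _compile is a thin re.compile wrapper):

    import re

    fstring_string_single_line = re.compile(r'(?:[^{\r\n]+|\{\{)+')

    m = fstring_string_single_line.match('ab }} {{ cd {expr}', 0)
    print(m.group(0))  # 'ab }} {{ cd ' -- the match stops at the single '{'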
@@ -148,6 +146,9 @@ def _create_token_collection(version_info):
     # StringPrefix can be the empty string (making it optional).
     possible_prefixes = _all_string_prefixes(version_info)
     StringPrefix = group(*possible_prefixes)
+    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
+    fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
+    FStringStart = group(*fstring_prefixes)

     # Tail end of ' string.
     Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -157,7 +158,7 @@ def _create_token_collection(version_info):
     Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
     # Tail end of """ string.
     Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-    Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

     # Because of leftmost-then-longest match semantics, be sure to put the
     # longest operators first (e.g., if = came before ==, == would get
@@ -181,7 +182,11 @@ def _create_token_collection(version_info):
group("'", r'\\\r?\n'), group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n')) group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) pseudo_extra_pool = [Comment, Triple]
if fstring_prefixes:
pseudo_extra_pool.append(FStringStart)
PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool)
PseudoToken = group(Whitespace, capture=True) + \ PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
@@ -199,18 +204,24 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
+    fstring_endpats = {}
     for t in possible_prefixes:
-        for p in (t + '"', t + "'"):
-            single_quoted.add(p)
-        for p in (t + '"""', t + "'''"):
-            triple_quoted.add(p)
+        for quote in '"', "'":
+            single_quoted.add(t + quote)
+        for quote in '"""', "'''":
+            triple_quoted.add(t + quote)
+
+    for t in fstring_prefixes:
+        for quote in '"', "'", '"""', "'''":
+            fstring_endpats[t + quote] = quote

     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        ALWAYS_BREAK_TOKENS
+        fstring_endpats, ALWAYS_BREAK_TOKENS
     )
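The new fstring_endpats dict maps every possible f-string opening to the quote sequence that will terminate it. A minimal sketch of the resulting mapping, assuming 'f' and 'F' are among the computed fstring_prefixes (the 'fr'/'rf' variants work the same way):

    fstring_endpats = {}
    for t in ('f', 'F'):  # assumption: a subset of the real fstring_prefixes
        for quote in '"', "'", '"""', "'''":
            fstring_endpats[t + quote] = quote

    print(fstring_endpats["f'''"])  # "'''" -- the tokenizer later searches for this quote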
@@ -233,6 +244,72 @@ class PythonToken(Token):
             self._replace(type=self._get_type_name()))


+class FStringNode(object):
+    def __init__(self, quote):
+        self.quote = quote
+        self.parentheses_count = 0
+        self.previous_lines = ''
+        self.in_format_spec = False
+
+    def open_parentheses(self, character):
+        self.parentheses_count += 1
+
+    def close_parentheses(self, character):
+        self.parentheses_count -= 1
+        return self.parentheses_count == 0
+
+    def allow_multiline(self):
+        return len(self.quote) == 3
+
+    def is_in_expr(self):
+        return self.parentheses_count and not self.in_format_spec
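The parentheses counter is what distinguishes a nested '{' inside an expression from the '}' that ends it. A usage sketch, assuming the FStringNode class above is in scope (think of tokenizing f'{ {1: 2}[1] }'):

    node = FStringNode("'")
    node.open_parentheses('{')          # the '{' that starts the expression
    node.open_parentheses('{')          # the '{' of the nested dict display
    print(node.is_in_expr())            # 2 (truthy) -- expression tokens are expected
    print(node.close_parentheses('}'))  # False -- still inside the expression
    print(node.close_parentheses('}'))  # True  -- back to literal f-string text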
+def _check_fstring_ending(fstring_stack, token):
+    fstring_end = float('inf')
+    fstring_index = None
+    for i, node in enumerate(fstring_stack):
+        try:
+            end = token.index(node.quote)
+        except ValueError:
+            pass
+        else:
+            if fstring_index is None or end < fstring_end:
+                fstring_index = i
+                fstring_end = end
+    return fstring_index, fstring_end
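Given the stack of currently open f-strings, this helper finds the earliest closing quote inside a token and reports which stack entry it terminates. A usage sketch, same in-scope assumption as above:

    stack = [FStringNode('"'), FStringNode("'")]
    print(_check_fstring_ending(stack, 'foo\'bar"'))
    # (1, 3): the "'" at index 3 closes stack[1] before the '"' at index 7 closes stack[0]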
+def _find_fstring_string(fstring_stack, line, pos):
+    tos = fstring_stack[-1]
+    if tos.is_in_expr():
+        return '', pos
+    else:
+        new_pos = pos
+        allow_multiline = tos.allow_multiline()
+        if allow_multiline:
+            match = fstring_string_multi_line.match(line, pos)
+        else:
+            match = fstring_string_single_line.match(line, pos)
+        if match is None:
+            string = tos.previous_lines
+        else:
+            string = match.group(0)
+            for fstring_stack_node in fstring_stack:
+                try:
+                    string = string[:string.index(fstring_stack_node.quote)]
+                except ValueError:
+                    pass  # The quote was not found in the string.
+            new_pos += len(string)
+            if allow_multiline and string.endswith('\n'):
+                tos.previous_lines += string
+                string = ''
+            else:
+                string = tos.previous_lines + string
+        return string, new_pos
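Outside of an expression this consumes literal text and advances the position; inside an expression it returns nothing, so the regular pseudo-token matching below takes over. A usage sketch (with the attribute fixes applied, same in-scope assumption):

    stack = [FStringNode("'")]
    print(_find_fstring_string(stack, "ab{x}cd'", 0))  # ('ab', 2) -- stops at the '{'

    stack[-1].open_parentheses('{')                    # now inside the {x} expression
    print(_find_fstring_string(stack, "ab{x}cd'", 3))  # ('', 3) -- 'x' is matched as a normal token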
 def tokenize(code, version_info, start_pos=(1, 0)):
     """Generate tokens from the source code (string)."""
     lines = split_lines(code, keepends=True)
@@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     additional_prefix = ''
     first = True
     lnum = start_pos[0] - 1
+    fstring_stack = []
     for line in lines:  # loop over lines in stream
         lnum += 1
         pos = 0
@@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 continue

         while pos < max:
+            if fstring_stack:
+                string, pos = _find_fstring_string(fstring_stack, line, pos)
+                if string:
+                    fstring_stack[-1].previous_lines = ''
+                    yield PythonToken(FSTRING_STRING, string, (lnum, pos), '')
+                    continue
+
             pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:  # scan for tokens
                 txt = line[pos:]
@@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if new_line and initial not in '\r\n#':
                 new_line = False
-                if paren_level == 0:
+                if paren_level == 0 and not fstring_stack:
                     i = 0
                     while line[i] == '\f':
                         i += 1
+                        # TODO don't we need to change spos as well?
                         start -= 1
                     if start > indents[-1]:
                         yield PythonToken(INDENT, '', spos, '')
@@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     yield PythonToken(DEDENT, '', spos, '')
                     indents.pop()

+            if fstring_stack:
+                fstring_index, end = _check_fstring_ending(fstring_stack, token)
+                if fstring_index is not None:
+                    if end != 0:
+                        yield PythonToken(ERRORTOKEN, token[:end], spos, prefix)
+
+                    yield PythonToken(
+                        FSTRING_END,
+                        fstring_stack[fstring_index].quote,
+                        (lnum, spos[1] + 1),
+                        prefix=''
+                    )
+                    del fstring_stack[fstring_index:]
+                    pos -= len(token) - end
+                    continue
+
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif initial in '\r\n':
-                if not new_line and paren_level == 0:
+                if any(not f.allow_multiline() for f in fstring_stack):
+                    fstring_stack.clear()
+
+                if not new_line and paren_level == 0 and not fstring_stack:
                     yield PythonToken(NEWLINE, token, spos, prefix)
                 else:
                     additional_prefix = prefix + token
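Taken together, the intended token stream for a small input looks roughly like this (a design sketch, not captured output of this first implementation):

    f'ab{x}cd'  ->  FSTRING_START  "f'"
                    FSTRING_STRING 'ab'
                    OP '{', NAME 'x', OP '}'
                    FSTRING_STRING 'cd'
                    FSTRING_END    "'"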
@@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                         break
                 else:                                  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
+            elif token in fstring_endpats:
+                fstring_stack.append(FStringNode(fstring_endpats[token]))
+                yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):               # ordinary name
                 if token in always_break_tokens:
+                    fstring_stack.clear()
                     paren_level = 0
                     while True:
                         indent = indents.pop()
@@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                             break
             else:
                 if token in '([{':
-                    paren_level += 1
+                    if fstring_stack:
+                        fstring_stack[-1].open_parentheses(token)
+                    else:
+                        paren_level += 1
                 elif token in ')]}':
-                    paren_level -= 1
+                    if fstring_stack:
+                        if fstring_stack[-1].close_parentheses(token):
+                            fstring_stack.pop()
+                    else:
+                        paren_level -= 1
+                elif token == ':' and fstring_stack \
+                        and fstring_stack[-1].parentheses_count == 1:
+                    fstring_stack[-1].in_format_spec = True

                 try:
                     # This check is needed in any case to check if it's a valid
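The ':' case only fires at parentheses_count == 1, i.e. for the outermost expression brace, so a ':' in a nested dict display or slice stays an ordinary operator, while f'{width:>10}' switches into format-spec mode and '>10' is emitted as FSTRING_STRING again. A sketch of the state change, same in-scope assumption as above:

    node = FStringNode("'")
    node.open_parentheses('{')       # f'{width
    if node.parentheses_count == 1:
        node.in_format_spec = True   # what the ':' branch above does
    print(node.is_in_expr())         # False -- _find_fstring_string now consumes the spec text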