A first implementation of the fstring tokenizer

Dave Halter
2018-03-30 20:50:49 +02:00
parent e05ce5ae31
commit d8d2e596a5
3 changed files with 144 additions and 26 deletions

View File

@@ -152,5 +152,6 @@ yield_arg: 'from' test | testlist
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content FSTRING_END
fstring_content: (FSTRING_STRING | fstring_expr)*
fstring_conversion: '!' NAME
fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content
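
As a quick orientation (a sketch inferred from the rules above, not output of
this commit): an f-string such as f"x{y!r:>4}" should decompose into

    FSTRING_START        f"
    FSTRING_STRING       x
    '{'                  {
    testlist             y
    fstring_conversion   !r
    fstring_format_spec  :>4
    '}'                  }
    FSTRING_END          "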

View File

@@ -38,8 +38,6 @@ FSTRING_END = next(_counter)
tok_name[FSTRING_END] = 'FSTRING_END'
FSTRING_STRING = next(_counter)
tok_name[FSTRING_STRING] = 'FSTRING_STRING'
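# The FSTRING_* types are allocated from the shared _counter above, so they
# simply extend the existing token numbering; tok_name maps each number back
# to its name, e.g. tok_name[FSTRING_START] == 'FSTRING_START'.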
# Map from operator to number (since tokenize doesn't do this)

View File

@@ -20,21 +20,15 @@ from codecs import BOM_UTF8
from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
FSTRING_END)
from parso._compatibility import py_version
from parso.utils import split_lines
TokenCollection = namedtuple(
'TokenCollection',
'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
)
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
@@ -68,7 +62,7 @@ def maybe(*choices):
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
def different_case_versions(prefix):
for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
yield ''.join(s)
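    # e.g. different_case_versions('fr') yields 'fr', 'fR', 'Fr' and 'FR'.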
@@ -79,7 +73,7 @@ def _all_string_prefixes(version_info):
if version_info >= (3, 0):
_valid_string_prefixes.append('br')
    if version_info >= (3, 6) and include_fstring:
        if only_fstring:
            # Assumed from the call site below: only_fstring restricts the
            # result to the f-string prefixes themselves.
            _valid_string_prefixes = ['f', 'fr']
        else:
            _valid_string_prefixes += ['f', 'fr']
        # if we add binary f-strings, add: ['fb', 'fbr']
@@ -109,6 +103,10 @@ def _get_token_collection(version_info):
return result
fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+')
fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+')
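# Behaviour sketch (assumed, not asserted anywhere in this commit): both
# patterns eat literal f-string text, treating '{{' as an escaped brace and
# stopping before a single '{'; the single-line variant also stops at a
# newline, e.g. fstring_string_single_line.match('ab{{cd{e}').group(0)
# gives 'ab{{cd'.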
def _create_token_collection(version_info):
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
@@ -148,6 +146,9 @@ def _create_token_collection(version_info):
# StringPrefix can be the empty string (making it optional).
possible_prefixes = _all_string_prefixes(version_info)
StringPrefix = group(*possible_prefixes)
StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
    # The start token has to include its quote (e.g. 'f"') so that it can be
    # looked up in fstring_endpats below; the quote alternatives here are an
    # assumption.
    FStringStart = group(*fstring_prefixes) + group('"""', "'''", '"', "'")
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -157,7 +158,7 @@ def _create_token_collection(version_info):
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
@@ -181,7 +182,11 @@ def _create_token_collection(version_info):
group("'", r'\\\r?\n'),
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
pseudo_extra_pool = [Comment, Triple]
if fstring_prefixes:
pseudo_extra_pool.append(FStringStart)
    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
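    # Sketch of the two capture groups (assumed semantics): for the input
    # '  f"abc', group 1 captures the whitespace prefix '  ' and group 2
    # captures 'f"' via the FStringStart alternative inside PseudoExtras.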
@@ -199,18 +204,24 @@ def _create_token_collection(version_info):
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
fstring_endpats = {}
for t in possible_prefixes:
for quote in '"', "'":
single_quoted.add(t + quote)
for quote in '"""', "'''":
triple_quoted.add(t + quote)
for t in fstring_prefixes:
for quote in '"', "'", '"""', "'''":
fstring_endpats[t + quote] = quote
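    # Resulting shape (illustration; assumes the 'f' prefixes were generated):
    #   fstring_endpats['f"']   == '"'
    #   fstring_endpats["f'''"] == "'''"
    # i.e. every f-string opener maps to the quote sequence that must end it.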
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
'finally', 'while', 'with', 'return')
pseudo_token_compiled = _compile(PseudoToken)
return TokenCollection(
pseudo_token_compiled, single_quoted, triple_quoted, endpats,
fstring_endpats, ALWAYS_BREAK_TOKENS
)
@@ -233,6 +244,72 @@ class PythonToken(Token):
self._replace(type=self._get_type_name()))
class FStringNode(object):
def __init__(self, quote):
self.quote = quote
self.parentheses_count = 0
self.previous_lines = ''
self.in_format_spec = False
def open_parentheses(self, character):
self.parentheses_count += 1
def close_parentheses(self, character):
self.parentheses_count -= 1
return self.parentheses_count == 0
    def allow_multiline(self):
        # Only triple-quoted f-strings may span multiple lines.
        return len(self.quote) == 3
def is_in_expr(self):
return self.parentheses_count and not self.in_format_spec
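# Rough driving sequence (hypothetical; mirrors the tokenize_lines loop below):
#   node = FStringNode('"')      # pushed when an FSTRING_START like 'f"' is seen
#   node.open_parentheses('{')   # '{' switches into expression mode
#   node.is_in_expr()            # -> True: normal pseudo-token matching resumes
#   node.close_parentheses('}')  # -> True: the bracket count is back at zero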
def _check_fstring_ending(fstring_stack, token):
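    # Find the end quote of every open f-string inside the token and return
    # (stack_index, offset) of the one that terminates earliest.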
fstring_end = float('inf')
fstring_index = None
for i, node in enumerate(fstring_stack):
try:
end = token.index(node.quote)
except ValueError:
pass
else:
if fstring_index is None or end < fstring_end:
fstring_index = i
fstring_end = end
return fstring_index, fstring_end
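# Example (assumed stack state): with an outer f" and a nested f' still open,
#   _check_fstring_ending(stack, 'ab\'cd"') == (1, 2)
# because the inner single quote at offset 2 terminates before the double one.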
def _find_fstring_string(fstring_stack, line, pos):
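    # Consume the literal (non-expression) part of the innermost f-string
    # starting at pos and return (string, new_pos).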
tos = fstring_stack[-1]
if tos.is_in_expr():
return '', pos
else:
new_pos = pos
allow_multiline = tos.allow_multiline()
if allow_multiline:
match = fstring_string_multi_line.match(line, pos)
else:
match = fstring_string_single_line.match(line, pos)
if match is None:
            string = tos.previous_lines
else:
string = match.group(0)
for fstring_stack_node in fstring_stack:
try:
string = string[:string.index(fstring_stack_node.quote)]
except ValueError:
pass # The string was not found.
new_pos += len(string)
if allow_multiline and string.endswith('\n'):
                tos.previous_lines += string
string = ''
else:
                string = tos.previous_lines + string
return string, new_pos
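# Example (single-line case, assumed): with fstring_stack == [FStringNode('"')],
#   _find_fstring_string(fstring_stack, 'hi {x}" rest', 0) == ('hi ', 3)
# The literal stops at the '{'; a closing '"' occurring earlier would have
# been trimmed away by the loop above.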
def tokenize(code, version_info, start_pos=(1, 0)):
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
@@ -247,7 +324,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
token. This idea comes from lib2to3. The prefix contains all information
that is irrelevant for the parser like newlines in parentheses or comments.
"""
pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
_get_token_collection(version_info)
paren_level = 0 # count parentheses
indents = [0]
@@ -264,6 +341,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
additional_prefix = ''
first = True
lnum = start_pos[0] - 1
fstring_stack = []
for line in lines: # loop over lines in stream
lnum += 1
pos = 0
@@ -294,6 +372,13 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
continue
while pos < max:
if fstring_stack:
string, pos = _find_fstring_string(fstring_stack, line, pos)
if string:
                    # Reset the multiline buffer of the innermost f-string.
                    fstring_stack[-1].previous_lines = ''
yield PythonToken(FSTRING_STRING, string, (lnum, pos), '')
continue
pseudomatch = pseudo_token.match(line, pos)
if not pseudomatch: # scan for tokens
txt = line[pos:]
@@ -318,10 +403,11 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
if new_line and initial not in '\r\n#':
new_line = False
if paren_level == 0 and not fstring_stack:
i = 0
while line[i] == '\f':
i += 1
# TODO don't we need to change spos as well?
start -= 1
if start > indents[-1]:
yield PythonToken(INDENT, '', spos, '')
@@ -333,11 +419,30 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
yield PythonToken(DEDENT, '', spos, '')
indents.pop()
if fstring_stack:
                fstring_index, end = _check_fstring_ending(fstring_stack, token)
                if fstring_index is not None:
                    quote = fstring_stack[fstring_index].quote
                    if end != 0:
                        yield PythonToken(ERRORTOKEN, token[:end], spos, prefix)

                    yield PythonToken(
                        FSTRING_END,
                        quote,
                        (lnum, spos[1] + end),
                        prefix=''
                    )
                    # The ended f-string and anything nested inside it is done.
                    del fstring_stack[fstring_index:]
                    # Resume scanning directly after the end quote.
                    pos -= len(token) - end - len(quote)
                    continue
if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield PythonToken(NUMBER, token, spos, prefix)
elif initial in '\r\n':
if any(not f.allow_multiline() for f in fstring_stack):
fstring_stack.clear()
if not new_line and paren_level == 0 and not fstring_stack:
yield PythonToken(NEWLINE, token, spos, prefix)
else:
additional_prefix = prefix + token
@@ -369,8 +474,12 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
break
else: # ordinary string
yield PythonToken(STRING, token, spos, prefix)
elif token in fstring_endpats:
fstring_stack.append(FStringNode(fstring_endpats[token]))
yield PythonToken(FSTRING_START, token, spos, prefix)
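                # e.g. the token 'f"' is a key of fstring_endpats, so an
                # FStringNode('"') is pushed and the following text is routed
                # through _find_fstring_string above.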
elif is_identifier(initial): # ordinary name
if token in always_break_tokens:
fstring_stack.clear()
paren_level = 0
while True:
indent = indents.pop()
@@ -385,9 +494,19 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
break
else:
if token in '([{':
if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
else:
paren_level += 1
elif token in ')]}':
if fstring_stack:
if fstring_stack[-1].close_parentheses(token):
fstring_stack.pop()
else:
paren_level -= 1
elif token == ':' and fstring_stack \
and fstring_stack[-1].parentheses_count == 1:
fstring_stack[-1].in_format_spec = True
try:
# This check is needed in any case to check if it's a valid