Remove the old f-string grammar and fix the tests with the new syntax

This commit is contained in:
Dave Halter
2018-04-07 02:11:26 +02:00
parent 9941348ec6
commit 8f1a436ba1
5 changed files with 66 additions and 293 deletions

View File

@@ -12,7 +12,6 @@ from parso.parser import BaseParser
from parso.python.parser import Parser as PythonParser from parso.python.parser import Parser as PythonParser
from parso.python.errors import ErrorFinderConfig from parso.python.errors import ErrorFinderConfig
from parso.python import pep8 from parso.python import pep8
from parso.python import fstring
_loaded_grammars = {} _loaded_grammars = {}
@@ -186,7 +185,6 @@ class Grammar(object):
normalizer.walk(node) normalizer.walk(node)
return normalizer.issues return normalizer.issues
def __repr__(self): def __repr__(self):
labels = self._pgen_grammar.number2symbol.values() labels = self._pgen_grammar.number2symbol.values()
txt = ' '.join(list(labels)[:3]) + ' ...' txt = ' '.join(list(labels)[:3]) + ' ...'
@@ -215,34 +213,6 @@ class PythonGrammar(Grammar):
return tokenize(code, self.version_info) return tokenize(code, self.version_info)
class PythonFStringGrammar(Grammar):
_token_namespace = fstring.TokenNamespace
_start_symbol = 'fstring'
def __init__(self):
super(PythonFStringGrammar, self).__init__(
text=fstring.GRAMMAR,
tokenizer=fstring.tokenize,
parser=fstring.Parser
)
def parse(self, code, **kwargs):
return self._parse(code, **kwargs)
def _parse(self, code, error_recovery=True, start_pos=(1, 0)):
tokens = self._tokenizer(code, start_pos=start_pos)
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_symbol=self._start_symbol,
)
return p.parse(tokens=tokens)
def parse_leaf(self, leaf, error_recovery=True):
code = leaf._get_payload()
return self.parse(code, error_recovery=True, start_pos=leaf.start_pos)
def load_grammar(**kwargs): def load_grammar(**kwargs):
""" """
Loads a :py:class:`parso.Grammar`. The default version is the current Python Loads a :py:class:`parso.Grammar`. The default version is the current Python
@@ -273,10 +243,6 @@ def load_grammar(**kwargs):
except FileNotFoundError: except FileNotFoundError:
message = "Python version %s is currently not supported." % version message = "Python version %s is currently not supported." % version
raise NotImplementedError(message) raise NotImplementedError(message)
elif language == 'python-f-string':
if version is not None:
raise NotImplementedError("Currently different versions are not supported.")
return PythonFStringGrammar()
else: else:
raise NotImplementedError("No support for language %s." % language) raise NotImplementedError("No support for language %s." % language)

View File

@@ -124,7 +124,9 @@ class PgenParser(object):
self.error_recovery = error_recovery self.error_recovery = error_recovery
def parse(self, tokens): def parse(self, tokens):
for type_, value, start_pos, prefix in tokens: for tok in tokens:
print(tok)
type_, value, start_pos, prefix = tok
if self.add_token(type_, value, start_pos, prefix): if self.add_token(type_, value, start_pos, prefix):
break break
else: else:

View File

@@ -1,211 +0,0 @@
import re
from itertools import count
from parso.utils import PythonVersionInfo
from parso.utils import split_lines
from parso.python.tokenize import Token
from parso import parser
from parso.tree import TypedLeaf, ErrorNode, ErrorLeaf
# Python 3.6 — the first version with f-strings (PEP 498).
version36 = PythonVersionInfo(3, 6)
class TokenNamespace:
    """Flat namespace of the integer token ids used by the f-string tokenizer.

    ``token_map`` is the reverse lookup table: token id -> token name.
    """
    (LBRACE, RBRACE, ENDMARKER, COLON, CONVERSION, PYTHON_EXPR,
     EXCLAMATION_MARK, UNTERMINATED_STRING) = range(8)

    # ``locals()`` as the comprehension's outermost iterable is evaluated in
    # the class namespace being built, so exactly the public ids above are
    # included (dunders and underscore names are filtered out).
    token_map = {v: k for k, v in locals().items() if not k.startswith('_')}

    @classmethod
    def generate_token_id(cls, string):
        """Return the token id for a punctuation character or a token name.

        Punctuation (``{ } ! :``) maps to its dedicated id; any other string
        is looked up as an attribute name (e.g. ``'ENDMARKER'``).
        """
        punctuation = {
            '{': cls.LBRACE,
            '}': cls.RBRACE,
            '!': cls.EXCLAMATION_MARK,
            ':': cls.COLON,
        }
        try:
            return punctuation[string]
        except KeyError:
            return getattr(cls, string)
# pgen grammar for the contents of an f-string (the text between the quotes).
GRAMMAR = """
fstring: expression* ENDMARKER
format_spec: ':' expression*
expression: '{' PYTHON_EXPR [ '!' CONVERSION ] [ format_spec ] '}'
"""

# Literal f-string text: everything up to the next single brace (or the end).
_prefix = r'((?:[^{}]+)*)'
_expr = _prefix + r'(\{|\}|$)'
# Inside an embedded expression: consume characters that are not structurally
# significant, then capture the one that is ({ } [ ] : " ' !), if any.
_in_expr = r'([^{}\[\]:"\'!]*)(.?)'
# There's only one conversion character allowed. But the rules have to be
# checked later anyway, so allow more here. This makes error recovery nicer.
_conversion = r'([^={}:]*)(.?)'

_compiled_expr = re.compile(_expr)
_compiled_in_expr = re.compile(_in_expr)
_compiled_conversion = re.compile(_conversion)
def tokenize(code, start_pos=(1, 0)):
    """Tokenize the *inside* of an f-string literal.

    Yields ``Token`` tuples whose types come from ``TokenNamespace``.  The
    stream always ends with an ENDMARKER token.  ``start_pos`` is the
    (line, column) of the first character of ``code``; positions of yielded
    tokens are tracked relative to it.
    """
    def add_to_pos(string):
        # Advance the running (line, column) position by ``string``,
        # accounting for embedded newlines.
        lines = split_lines(string)
        l = len(lines[-1])
        if len(lines) > 1:
            start_pos[0] += len(lines) - 1
            start_pos[1] = l
        else:
            start_pos[1] += l

    def tok(value, type=None, prefix=''):
        # Build a Token at the current position.  Single-character tokens
        # ({ } ! :) derive their type from the value itself.
        if type is None:
            type = TokenNamespace.generate_token_id(value)
        add_to_pos(prefix)
        token = Token(type, value, tuple(start_pos), prefix)
        add_to_pos(value)
        return token

    start = 0
    recursion_level = 0  # how many unmatched '{' we are currently inside
    added_prefix = ''    # literal text ({{ / }} escapes) carried into the next prefix
    start_pos = list(start_pos)  # mutable copy, updated in place by add_to_pos()
    while True:
        match = _compiled_expr.match(code, start)
        prefix = added_prefix + match.group(1)
        found = match.group(2)
        start = match.end()
        if not found:
            # We're at the end.
            break

        if found == '}':
            if recursion_level == 0 and len(code) > start and code[start] == '}':
                # This is a }} escape.
                added_prefix = prefix + '}}'
                start += 1
                continue

            recursion_level = max(0, recursion_level - 1)
            yield tok(found, prefix=prefix)
            added_prefix = ''
        else:
            assert found == '{'
            if recursion_level == 0 and len(code) > start and code[start] == '{':
                # This is a {{ escape.
                added_prefix = prefix + '{{'
                start += 1
                continue

            recursion_level += 1
            yield tok(found, prefix=prefix)
            added_prefix = ''

            # Scan the embedded Python expression up to the first '!', ':'
            # or '}' that is not nested in brackets or a string literal.
            expression = ''
            squared_count = 0  # depth of [ ... ] nesting
            curly_count = 0    # depth of { ... } nesting inside the expression
            while True:
                expr_match = _compiled_in_expr.match(code, start)
                expression += expr_match.group(1)
                found = expr_match.group(2)
                start = expr_match.end()
                if found == '{':
                    curly_count += 1
                    expression += found
                elif found == '}' and curly_count > 0:
                    curly_count -= 1
                    expression += found
                elif found == '[':
                    squared_count += 1
                    expression += found
                elif found == ']':
                    # Use a max function here, because the Python code might
                    # just have syntax errors.
                    squared_count = max(0, squared_count - 1)
                    expression += found
                elif found == ':' and (squared_count or curly_count):
                    # A ':' inside brackets (slice, dict, lambda-free subset)
                    # belongs to the expression, not to a format spec.
                    expression += found
                elif found in ('"', "'"):
                    # Skip over a (possibly triple-quoted) string literal.
                    search = found
                    if len(code) > start + 1 and \
                            code[start] == found == code[start+1]:
                        search *= 3
                        start += 2
                    index = code.find(search, start)
                    if index == -1:
                        # Unterminated string: emit what we have and stop.
                        yield tok(expression, type=TokenNamespace.PYTHON_EXPR)
                        yield tok(
                            found + code[start:],
                            type=TokenNamespace.UNTERMINATED_STRING,
                        )
                        start = len(code)
                        break
                    expression += found + code[start:index+1]
                    start = index + 1
                elif found == '!' and len(code) > start and code[start] == '=':
                    # This is a python `!=` and not a conversion.
                    expression += found
                else:
                    # End of the expression: emit it, plus the terminating
                    # '!', ':' or '}' (if there was one).
                    yield tok(expression, type=TokenNamespace.PYTHON_EXPR)
                    if found:
                        yield tok(found)
                    break

            if found == '!':
                conversion_match = _compiled_conversion.match(code, start)
                found = conversion_match.group(2)
                start = conversion_match.end()
                yield tok(conversion_match.group(1), type=TokenNamespace.CONVERSION)
                if found:
                    yield tok(found)
            if found == '}':
                recursion_level -= 1
            # We don't need to handle everything after ':', because that is
            # basically new tokens.

    yield tok('', type=TokenNamespace.ENDMARKER, prefix=prefix)
class Parser(parser.BaseParser):
    """pgen parser for the f-string grammar, with custom error recovery."""

    def parse(self, tokens):
        node = super(Parser, self).parse(tokens)
        if isinstance(node, self.default_leaf):  # Is an endmarker.
            # If there's no curly braces we get back a non-module. We always
            # want an fstring.
            node = self.default_node('fstring', [node])
        return node

    def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos):
        # TODO this is so ugly.
        # Map the integer token id back to a lower-case leaf type name,
        # e.g. PYTHON_EXPR -> 'python_expr'.
        leaf_type = TokenNamespace.token_map[type].lower()
        return TypedLeaf(leaf_type, value, start_pos, prefix)

    def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                       add_token_callback):
        # With error recovery disabled, defer to the base class (which raises).
        if not self._error_recovery:
            return super(Parser, self).error_recovery(
                pgen_grammar, stack, arcs, typ, value, start_pos, prefix,
                add_token_callback
            )
        token_type = TokenNamespace.token_map[typ].lower()
        if len(stack) == 1:
            # Nothing above the root to unwind: wrap the offending token.
            error_leaf = ErrorLeaf(token_type, value, start_pos, prefix)
            stack[0][2][1].append(error_leaf)
        else:
            # Collapse everything above the root into an ErrorNode, then
            # retry the current token against the root state.
            # NOTE(review): assumes pgen stack entries are
            # (dfa, state, (type, nodes)) — confirm against parso's PgenParser.
            dfa, state, (type_, nodes) = stack[1]
            stack[0][2][1].append(ErrorNode(nodes))
            stack[1:] = []
            add_token_callback(typ, value, start_pos, prefix)

View File

@@ -106,8 +106,8 @@ def _get_token_collection(version_info):
return result return result
fstring_string_single_line = _compile(r'(?:[^{\r\n]+|\{\{)+') fstring_string_single_line = _compile(r'(?:[^{}\r\n]+|\{\{|\}\})+')
fstring_string_multi_line = _compile(r'(?:[^{]+|\{\{)+') fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+')
def _create_token_collection(version_info): def _create_token_collection(version_info):
@@ -253,7 +253,10 @@ class FStringNode(object):
self.quote = quote self.quote = quote
self.parentheses_count = 0 self.parentheses_count = 0
self.previous_lines = '' self.previous_lines = ''
self.in_format_spec = False self.last_string_start_pos = None
# In the syntax there can be multiple format_spec's nested:
# {x:{y:3}}
self.format_spec_count = 0
def open_parentheses(self, character): def open_parentheses(self, character):
self.parentheses_count += 1 self.parentheses_count += 1
@@ -265,7 +268,7 @@ class FStringNode(object):
return len(self.quote) == 3 return len(self.quote) == 3
def is_in_expr(self): def is_in_expr(self):
return self.parentheses_count return (self.parentheses_count - self.format_spec_count) > 0
def _check_fstring_ending(fstring_stack, token, from_start=False): def _check_fstring_ending(fstring_stack, token, from_start=False):
@@ -290,7 +293,7 @@ def _check_fstring_ending(fstring_stack, token, from_start=False):
return fstring_index, fstring_end return fstring_index, fstring_end
def _find_fstring_string(fstring_stack, line, pos): def _find_fstring_string(fstring_stack, line, lnum, pos):
tos = fstring_stack[-1] tos = fstring_stack[-1]
if tos.is_in_expr(): if tos.is_in_expr():
return '', pos return '', pos
@@ -302,8 +305,12 @@ def _find_fstring_string(fstring_stack, line, pos):
else: else:
match = fstring_string_single_line.match(line, pos) match = fstring_string_single_line.match(line, pos)
if match is None: if match is None:
string = fstring_stack[-1].previous_lines string = tos.previous_lines
else: else:
print(match, lnum, pos, repr(tos.previous_lines))
if not tos.previous_lines:
tos.last_string_start_pos = (lnum, pos)
string = match.group(0) string = match.group(0)
for fstring_stack_node in fstring_stack: for fstring_stack_node in fstring_stack:
try: try:
@@ -313,12 +320,11 @@ def _find_fstring_string(fstring_stack, line, pos):
new_pos += len(string) new_pos += len(string)
if allow_multiline and string.endswith('\n'): if allow_multiline and string.endswith('\n'):
fstring_stack[-1].previous_lines += string tos.previous_lines += string
string = '' string = ''
else: else:
string = fstring_stack[-1].previous_lines + string string = tos.previous_lines + string
fstring_stack[-1].previous_lines = ''
return string, new_pos return string, new_pos
@@ -385,25 +391,31 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
while pos < max: while pos < max:
if fstring_stack: if fstring_stack:
string, pos = _find_fstring_string(fstring_stack, line, pos) string, pos = _find_fstring_string(fstring_stack, line, lnum, pos)
if string: if string:
yield PythonToken(FSTRING_STRING, string, (lnum, pos), '') yield PythonToken(
FSTRING_STRING, string,
fstring_stack[-1].last_string_start_pos, ''
)
fstring_stack[-1].previous_lines = ''
continue continue
if pos < max: if pos == max:
rest = line[pos:] break
fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True)
if fstring_index is not None: rest = line[pos:]
yield PythonToken( fstring_index, end = _check_fstring_ending(fstring_stack, rest, from_start=True)
FSTRING_END,
fstring_stack[fstring_index].quote, if fstring_index is not None:
(lnum, pos), yield PythonToken(
prefix='' FSTRING_END,
) fstring_stack[fstring_index].quote,
del fstring_stack[fstring_index:] (lnum, pos),
pos += end prefix=''
continue )
del fstring_stack[fstring_index:]
pos += end
continue
pseudomatch = pseudo_token.match(line, pos) pseudomatch = pseudo_token.match(line, pos)
if not pseudomatch: # scan for tokens if not pseudomatch: # scan for tokens
@@ -531,7 +543,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
paren_level -= 1 paren_level -= 1
elif token == ':' and fstring_stack \ elif token == ':' and fstring_stack \
and fstring_stack[-1].parentheses_count == 1: and fstring_stack[-1].parentheses_count == 1:
fstring_stack[-1].in_format_spec = True fstring_stack[-1].format_spec_count += 1
try: try:
# This check is needed in any case to check if it's a valid # This check is needed in any case to check if it's a valid

View File

@@ -1,17 +1,18 @@
import pytest import pytest
from parso import load_grammar, ParserSyntaxError from parso import load_grammar, ParserSyntaxError
from parso.python.fstring import tokenize from parso.python.tokenize import tokenize
@pytest.fixture @pytest.fixture
def grammar(): def grammar():
return load_grammar(language="python-f-string") return load_grammar(version='3.6')
@pytest.mark.parametrize( @pytest.mark.parametrize(
'code', [ 'code', [
'{1}', '{1}',
'{1:}',
'', '',
'{1!a}', '{1!a}',
'{1!a:1}', '{1!a:1}',
@@ -26,22 +27,12 @@ def grammar():
'{{{1}', '{{{1}',
'1{{2{{3', '1{{2{{3',
'}}', '}}',
'{:}}}',
# Invalid, but will be checked, later.
'{}',
'{1:}',
'{:}',
'{:1}',
'{!:}',
'{!}',
'{!a}',
'{1:{}}',
'{1:{:}}',
] ]
) )
def test_valid(code, grammar): def test_valid(code, grammar):
fstring = grammar.parse(code, error_recovery=False) code = 'f"""%s"""' % code
module = grammar.parse(code, error_recovery=False)
fstring = module.children[0]
assert fstring.type == 'fstring' assert fstring.type == 'fstring'
assert fstring.get_code() == code assert fstring.get_code() == code
@@ -52,24 +43,37 @@ def test_valid(code, grammar):
'{', '{',
'{1!{a}}', '{1!{a}}',
'{!{a}}', '{!{a}}',
'{}',
'{:}',
'{:}}}',
'{:1}',
'{!:}',
'{!}',
'{!a}',
'{1:{}}',
'{1:{:}}',
] ]
) )
def test_invalid(code, grammar): def test_invalid(code, grammar):
code = 'f"""%s"""' % code
with pytest.raises(ParserSyntaxError): with pytest.raises(ParserSyntaxError):
grammar.parse(code, error_recovery=False) grammar.parse(code, error_recovery=False)
# It should work with error recovery. # It should work with error recovery.
#grammar.parse(code, error_recovery=True) grammar.parse(code, error_recovery=True)
@pytest.mark.parametrize( @pytest.mark.parametrize(
('code', 'start_pos', 'positions'), [ ('code', 'positions'), [
# 2 times 2, 5 because python expr and endmarker. # 2 times 2, 5 because python expr and endmarker.
('}{', (2, 3), [(2, 3), (2, 4), (2, 5), (2, 5)]), ('f"}{"', [(1, 0), (1, 2), (1, 3), (1, 4), (1, 5)]),
(' :{ 1 : } ', (1, 0), [(1, 2), (1, 3), (1, 6), (1, 8), (1, 10)]), ('f" :{ 1 : } "', [(1, 0), (1, 2), (1, 4), (1, 6), (1, 8), (1, 9),
('\n{\nfoo\n }', (2, 1), [(3, 0), (3, 1), (5, 1), (5, 2)]), (1, 10), (1, 11), (1, 12), (1, 13)]),
('f"""\n {\nfoo\n }"""', [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1),
(4, 2), (4, 5)]),
] ]
) )
def test_tokenize_start_pos(code, start_pos, positions): def test_tokenize_start_pos(code, positions):
tokens = tokenize(code, start_pos) tokens = list(tokenize(code, version_info=(3, 6)))
print(tokens)
assert positions == [p.start_pos for p in tokens] assert positions == [p.start_pos for p in tokens]