Fix a few things so that the tokenizer can at least parse the grammar.

Dave Halter
2018-03-30 22:13:18 +02:00
parent d8d2e596a5
commit 235fda3fbb
3 changed files with 31 additions and 22 deletions

View File: parso/pgen2/pgen.py

@@ -28,6 +28,7 @@ class ParserGenerator(object):
         c = grammar.Grammar(self._bnf_text)
         names = list(self.dfas.keys())
         names.sort()
+        # TODO do we still need this?
         names.remove(self.startsymbol)
         names.insert(0, self.startsymbol)
         for name in names:
@@ -316,8 +317,8 @@ class ParserGenerator(object):
     def _expect(self, type):
         if self.type != type:
-            self._raise_error("expected %s, got %s(%s)",
-                              type, self.type, self.value)
+            self._raise_error("expected %s(%s), got %s(%s)",
+                              type, token.tok_name[type], self.type, self.value)
         value = self.value
         self._gettoken()
         return value
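
The `_expect()` change only improves the error message: the expected token type is now printed together with its name from `token.tok_name`, instead of as a bare number. A small standalone sketch of the difference, using the stdlib `token` module purely for illustration (parso carries its own `tok_name` table):

import token  # stdlib token module, used here only for illustration

expected, got, value = token.NAME, token.OP, ':'

old_msg = "expected %s, got %s(%s)" % (expected, got, value)
new_msg = "expected %s(%s), got %s(%s)" % (
    expected, token.tok_name[expected], got, value)

print(old_msg)  # e.g. "expected 1, got 54(:)", the bare number is hard to read
print(new_msg)  # e.g. "expected 1(NAME), got 54(:)", the expected type is named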

View File: parso/python/token.py

@@ -38,6 +38,8 @@ FSTRING_END = next(_counter)
 tok_name[FSTRING_END] = 'FSTRING_END'
 FSTRING_STRING = next(_counter)
 tok_name[FSTRING_STRING] = 'FSTRING_STRING'
+EXCLAMATION = next(_counter)
+tok_name[EXCLAMATION] = 'EXCLAMATION'
 # Map from operator to number (since tokenize doesn't do this)
@@ -90,6 +92,7 @@ opmap_raw = """\
 //= DOUBLESLASHEQUAL
 -> RARROW
 ... ELLIPSIS
+! EXCLAMATION
 """
 opmap = {}
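
The second file adds an EXCLAMATION operator token so that a bare `!` (used by the work-in-progress f-string grammar) can be tokenized at all. A rough, self-contained sketch of how an `opmap_raw`-style table typically becomes an operator-to-token-number mapping; the parsing loop below is an assumption modeled on lib2to3-style code, not copied from parso:

from itertools import count

_counter = count(256)                    # arbitrary start value for this sketch
tok_name = {}

ELLIPSIS = next(_counter)
tok_name[ELLIPSIS] = 'ELLIPSIS'
EXCLAMATION = next(_counter)             # the token this commit introduces
tok_name[EXCLAMATION] = 'EXCLAMATION'

opmap_raw = """\
... ELLIPSIS
! EXCLAMATION
"""

opmap = {}
for line in opmap_raw.splitlines():
    op, name = line.split()
    opmap[op] = globals()[name]

print(opmap['!'] == EXCLAMATION)   # True
print(tok_name[opmap['!']])        # EXCLAMATION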

View File: parso/python/tokenize.py

@@ -28,7 +28,7 @@ from parso.utils import split_lines
 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats fstring_endpats always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
 )
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
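
Renaming the namedtuple field from `fstring_endpats` to `fstring_pattern_map` also renames the attribute on every `TokenCollection` instance, which is why the unpacking in `tokenize_lines()` further down changes as well. A tiny sketch with made-up field values:

from collections import namedtuple

TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
)

# Made-up values, just to show the renamed attribute.
tc = TokenCollection(None, set(), set(), {}, {'f"': '"'}, ())
print(tc.fstring_pattern_map)   # {'f"': '"'}
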
@@ -53,32 +53,35 @@ def group(*choices, **kwargs):
     return start + '|'.join(choices) + ')'
 def any(*choices):
     return group(*choices) + '*'
 def maybe(*choices):
     return group(*choices) + '?'
 # Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes(version_info, include_fstring=False):
+def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
     def different_case_versions(prefix):
         for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
             yield ''.join(s)
     # The valid string prefixes. Only contain the lower case versions,
     # and don't contain any permutations (include 'fr', but not
     # 'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u']
+    valid_string_prefixes = ['b', 'r', 'u']
     if version_info >= (3, 0):
-        _valid_string_prefixes.append('br')
+        valid_string_prefixes.append('br')
+    result = {''}
     if version_info >= (3, 6) and include_fstring:
-        _valid_string_prefixes += ['f', 'fr']
+        f = ['f', 'fr']
+        if only_fstring:
+            valid_string_prefixes = f
+            result = set()
+        else:
+            valid_string_prefixes += f
+    elif only_fstring:
+        return set()
     # if we add binary f-strings, add: ['fb', 'fbr']
-    result = set([''])
-    for prefix in _valid_string_prefixes:
+    for prefix in valid_string_prefixes:
         for t in _itertools.permutations(prefix):
             # create a list with upper and lower versions of each
             # character
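
The reworked `_all_string_prefixes()` can now also return only the f-string prefixes, presumably so a dedicated FSTRING_START pattern can be built from them (the next hunk uses `fstring_prefixes` for exactly that). A condensed, self-contained sketch of the behaviour; it hard-codes the Python 3 prefix list and drops the version checks, so it is an approximation rather than parso's function:

import itertools

def all_string_prefixes(include_fstring=False, only_fstring=False):
    valid_string_prefixes = ['b', 'r', 'u', 'br']   # version checks omitted
    result = {''}
    if include_fstring:
        f = ['f', 'fr']
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()
    for prefix in valid_string_prefixes:
        for t in itertools.permutations(prefix):
            # every ordering of the prefix, in every upper/lower-case combination
            result.update(
                ''.join(c)
                for c in itertools.product(*[(x, x.upper()) for x in t])
            )
    return result

print(sorted(all_string_prefixes(include_fstring=True, only_fstring=True)))
# ['F', 'FR', 'Fr', 'RF', 'Rf', 'f', 'fR', 'fr', 'rF', 'rf']
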
@@ -183,10 +186,11 @@ def _create_token_collection(version_info):
                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                     group('"', r'\\\r?\n'))
     pseudo_extra_pool = [Comment, Triple]
+    all_quotes = '"', "'", '"""', "'''"
     if fstring_prefixes:
-        pseudo_extra_pool.append(FStringStart)
+        pseudo_extra_pool.append(FStringStart + group(*all_quotes))
-    PseudoExtras = group(r'\\\r?\n|\Z', pseudo_extra_pool)
+    PseudoExtras = group(r'\\\r?\n|\Z', *pseudo_extra_pool)
     PseudoToken = group(Whitespace, capture=True) + \
         group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
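
The `*pseudo_extra_pool` change is a real bug fix: `group()` joins its arguments with `|`, so passing the list object itself (old line) blows up inside `str.join`, while unpacking it (new line) adds each alternative to the pattern. A quick standalone illustration with a simplified `group()` and made-up pool contents:

def group(*choices, **kwargs):
    # Simplified version of the tokenizer helper: OR all alternatives together.
    start = '(' if kwargs.get('capture', False) else '(?:'
    return start + '|'.join(choices) + ')'

pseudo_extra_pool = [r'#[^\r\n]*', r'\w+']   # made-up alternatives

print(group(r'\\\r?\n|\Z', *pseudo_extra_pool))   # (?:\\\r?\n|\Z|#[^\r\n]*|\w+)

try:
    group(r'\\\r?\n|\Z', pseudo_extra_pool)       # the old, unfixed call
except TypeError as exc:
    print('old form fails:', exc)
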
@@ -204,7 +208,7 @@ def _create_token_collection(version_info):
     # including the opening quotes.
     single_quoted = set()
     triple_quoted = set()
-    fstring_endpats = {}
+    fstring_pattern_map = {}
     for t in possible_prefixes:
         for quote in '"', "'":
             single_quoted.add(t + quote)
@@ -213,15 +217,15 @@ def _create_token_collection(version_info):
             triple_quoted.add(t + quote)
     for t in fstring_prefixes:
-        for quote in '"', "'", '"""', "'''":
-            fstring_endpats[t + quote] = quote
+        for quote in all_quotes:
+            fstring_pattern_map[t + quote] = quote
     ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                            'finally', 'while', 'with', 'return')
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        fstring_endpats, ALWAYS_BREAK_TOKENS
+        fstring_pattern_map, ALWAYS_BREAK_TOKENS
     )
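
The renamed `fstring_pattern_map` maps each possible f-string opener (prefix plus quote) to the quote that will later terminate it; the tokenizer uses membership in this dict to recognize the start of an f-string (see the last hunk). Roughly, the loop above builds something like the following (only two prefixes shown):

all_quotes = '"', "'", '"""', "'''"
fstring_prefixes = ['f', 'F']        # the real set also has the fr/rf variants

fstring_pattern_map = {}
for t in fstring_prefixes:
    for quote in all_quotes:
        fstring_pattern_map[t + quote] = quote

print(fstring_pattern_map['f"'])      # "
print(fstring_pattern_map["F'''"])    # '''
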
@@ -324,7 +328,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, fstring_endpats, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -372,6 +376,7 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 continue
         while pos < max:
+            assert not fstring_stack
             if fstring_stack:
                 string, pos = _find_fstring_string(fstring_stack, line, pos)
                 if string:
@@ -474,8 +479,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     break
                 else:  # ordinary string
                     yield PythonToken(STRING, token, spos, prefix)
-            elif token in fstring_endpats:
-                fstring_stack.append(FStringNode(fstring_endpats[token]))
+            elif token in fstring_pattern_map:  # The start of an fstring.
+                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
             elif is_identifier(initial):  # ordinary name
                 if token in always_break_tokens:
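
The last hunk is where that map is consulted: an opener found in `fstring_pattern_map` pushes a node onto `fstring_stack` and is emitted as FSTRING_START instead of being treated as an ordinary string. A minimal sketch of that dispatch, with a stand-in `FStringNode` (parso's real class tracks more state):

class FStringNode:
    # Stand-in for parso's FStringNode; here it only remembers the closing quote.
    def __init__(self, quote):
        self.quote = quote

fstring_pattern_map = {'f"': '"', "f'''": "'''"}   # abbreviated map
fstring_stack = []

token = 'f"'
if token in fstring_pattern_map:   # the start of an f-string
    fstring_stack.append(FStringNode(fstring_pattern_map[token]))
    print('FSTRING_START', token, '-> closes with', fstring_stack[-1].quote)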