Fix tokenizer for random invalid unicode points

Dave Halter
2020-03-28 21:01:57 +01:00
parent 38b7763e9a
commit a950b82066
2 changed files with 39 additions and 30 deletions

View File

@@ -496,33 +496,25 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 pseudomatch = pseudo_token.match(string_line, pos)
             else:
                 pseudomatch = pseudo_token.match(line, pos)
-            if not pseudomatch:                             # scan for tokens
-                match = whitespace.match(line, pos)
-                if pos == 0 and paren_level == 0:
-                    for t in dedent_if_necessary(match.end()):
-                        yield t
-                pos = match.end()
-                new_line = False
-                yield PythonToken(
-                    ERRORTOKEN, line[pos], (lnum, pos),
-                    additional_prefix + match.group(0)
-                )
-                additional_prefix = ''
-                pos += 1
-                continue
-
-            prefix = additional_prefix + pseudomatch.group(1)
-            additional_prefix = ''
-            start, pos = pseudomatch.span(2)
-            spos = (lnum, start)
-            token = pseudomatch.group(2)
-            if token == '':
-                assert prefix
-                additional_prefix = prefix
-                # This means that we have a line with whitespace/comments at
-                # the end, which just results in an endmarker.
-                break
-            initial = token[0]
+            if pseudomatch:
+                prefix = additional_prefix + pseudomatch.group(1)
+                additional_prefix = ''
+                start, pos = pseudomatch.span(2)
+                spos = (lnum, start)
+                token = pseudomatch.group(2)
+
+                if token == '':
+                    assert prefix
+                    additional_prefix = prefix
+                    # This means that we have a line with whitespace/comments at
+                    # the end, which just results in an endmarker.
+                    break
+                initial = token[0]
+            else:
+                match = whitespace.match(line, pos)
+                initial = line[match.end()]
+                start = match.end()
+                spos = (lnum, start)
 
             if new_line and initial not in '\r\n\\#':
                 new_line = False
@@ -539,8 +531,23 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     for t in dedent_if_necessary(indent_start):
                         yield t
 
-            if (initial in numchars or  # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
+            if not pseudomatch:                             # scan for tokens
+                match = whitespace.match(line, pos)
+                if pos == 0 and paren_level == 0:
+                    for t in dedent_if_necessary(match.end()):
+                        yield t
+                pos = match.end()
+                new_line = False
+                yield PythonToken(
+                    ERRORTOKEN, line[pos], (lnum, pos),
+                    additional_prefix + match.group(0)
+                )
+                additional_prefix = ''
+                pos += 1
+                continue
+
+            if (initial in numchars  # ordinary number
+                    or (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif pseudomatch.group(3) is not None:            # ordinary name
                 if token in always_break_tokens:
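
The net effect of the reordering above: when a line contains a character that the pseudo-token regex cannot match (a stray NUL byte or some other invalid code point), the indentation bookkeeping now runs before that character is emitted as an ERRORTOKEN, instead of short-circuiting at the top of the loop. A minimal sketch of how to observe this, assuming parso's internal tokenize() helper still takes a code string plus a version_info tuple; this is internal API, so the exact import path and signature may differ between parso releases:

    # Hedged sketch -- parso internals, not a stable public interface.
    from parso.python.tokenize import tokenize
    from parso.utils import parse_version_string

    for token in tokenize(' \x00a', parse_version_string('3.8')):
        print(token.type, repr(token.string), token.start_pos, repr(token.prefix))
    # With this commit, the expected sequence (see the new test case below) is:
    # INDENT, ERRORTOKEN '\x00', NAME 'a', DEDENT, ENDMARKER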

View File

@@ -258,7 +258,7 @@ def test_token_types(code, types):
 
 
 def test_error_string():
-    t1, newline, endmarker = _get_token_list(' "\n')
+    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
     assert t1.type == ERRORTOKEN
     assert t1.prefix == ' '
     assert t1.string == '"'
@@ -339,6 +339,8 @@ def test_backslash():
 
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
+        # f-strings
         ('f"', [FSTRING_START]),
         ('f""', [FSTRING_START, FSTRING_END]),
         ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
@@ -394,7 +396,7 @@ def test_backslash():
         ]),
     ]
 )
-def test_fstring(code, types, version_ge_py36):
+def test_token_types(code, types, version_ge_py36):
     actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
     assert types + [ENDMARKER] == actual_types
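
As a cross-check outside the tokenizer tests, the same input can be run through parso's stable public API with error recovery. This is a hedged sketch; the exact layout of the recovered error nodes depends on the parso version, but parso's usual guarantee is that get_code() reproduces the input exactly, even for code containing errors:

    import parso

    # The invalid character ends up inside an error leaf, and the original
    # source is still reproduced exactly by the tree.
    module = parso.parse(' \x00a')
    assert module.get_code() == ' \x00a'
    print([child.type for child in module.children])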