Mirror of https://github.com/davidhalter/parso.git (synced 2025-12-08 05:34:51 +08:00)
Fix a prefix issue with error leafs.
parso/python/tokenize.py

@@ -28,7 +28,8 @@ from parso.utils import split_lines
 
 TokenCollection = namedtuple(
     'TokenCollection',
-    'pseudo_token single_quoted triple_quoted endpats fstring_pattern_map always_break_tokens',
+    'pseudo_token single_quoted triple_quoted endpats whitespace '
+    'fstring_pattern_map always_break_tokens',
 )
 
 BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
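The split field list above works because Python concatenates adjacent string
literals at compile time, so the two quoted fragments form a single
space-separated field string. A minimal standalone sketch:

    # Adjacent literals fuse into one field string for the namedtuple.
    fields = ('pseudo_token single_quoted triple_quoted endpats whitespace '
              'fstring_pattern_map always_break_tokens')
    assert fields.split()[4] == 'whitespace'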
@@ -114,6 +115,7 @@ def _create_token_collection(version_info):
     # Note: we use unicode matching for names ("\w") but ascii matching for
     # number literals.
     Whitespace = r'[ \f\t]*'
+    whitespace = _compile(Whitespace)
     Comment = r'#[^\r\n]*'
     Name = r'\w+'
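The newly compiled pattern matches horizontal whitespace only (spaces, form
feeds, tabs), never newlines, which is what lets the tokenizer peel a prefix
off an error position without swallowing the line break. A standalone sketch
with plain re (assuming parso's _compile is essentially a re.compile wrapper):

    import re

    whitespace = re.compile(r'[ \f\t]*')  # same pattern as Whitespace above
    m = whitespace.match(' \t\x0c"abc', 0)
    assert m.group(0) == ' \t\x0c'  # consumes spaces, tabs, form feeds...
    assert m.end() == 3             # ...and stops before the quote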
@@ -225,7 +227,7 @@ def _create_token_collection(version_info):
     pseudo_token_compiled = _compile(PseudoToken)
     return TokenCollection(
         pseudo_token_compiled, single_quoted, triple_quoted, endpats,
-        fstring_pattern_map, ALWAYS_BREAK_TOKENS
+        whitespace, fstring_pattern_map, ALWAYS_BREAK_TOKENS
     )
 
 
@@ -354,7 +356,8 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     token. This idea comes from lib2to3. The prefix contains all information
     that is irrelevant for the parser like newlines in parentheses or comments.
     """
-    pseudo_token, single_quoted, triple_quoted, endpats, fstring_pattern_map, always_break_tokens, = \
+    pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
+        fstring_pattern_map, always_break_tokens, = \
         _get_token_collection(version_info)
     paren_level = 0  # count parentheses
     indents = [0]
@@ -435,10 +438,14 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
 
             pseudomatch = pseudo_token.match(line, pos)
             if not pseudomatch:  # scan for tokens
-                txt = line[pos:]
-                if txt.endswith('\n'):
+                if line.endswith('\n'):
                     new_line = True
-                yield PythonToken(ERRORTOKEN, txt, (lnum, pos), additional_prefix)
+                match = whitespace.match(line, pos)
+                pos = match.end()
+                yield PythonToken(
+                    ERRORTOKEN, line[pos:], (lnum, pos),
+                    additional_prefix + match.group(0)
+                )
                 additional_prefix = ''
                 break
 
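This hunk is the substance of the fix. Before, the whole remainder of the line
(leading whitespace included) became the ERRORTOKEN's string; now the compiled
whitespace pattern moves that leading whitespace into the token's prefix, where
parser-irrelevant text belongs. A self-contained sketch of the new logic, using
plain re instead of parso's internals (split_error is a hypothetical helper,
named here only for illustration):

    import re

    whitespace = re.compile(r'[ \f\t]*')

    def split_error(line, pos, additional_prefix=''):
        # Leading horizontal whitespace goes into the prefix ...
        match = whitespace.match(line, pos)
        pos = match.end()
        # ... and only the rest becomes the error token's string.
        return additional_prefix + match.group(0), line[pos:]

    assert split_error(' "\n', 0) == (' ', '"\n')  # mirrors test_error_string below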
test/test_tokenize.py

@@ -227,3 +227,11 @@ def test_endmarker_end_pos():
 def test_indentation(code, types):
     actual_types = [t.type for t in _get_token_list(code)]
     assert actual_types == types + [ENDMARKER]
+
+
+def test_error_string():
+    t1, endmarker = _get_token_list(' "\n')
+    assert t1.type == ERRORTOKEN
+    assert t1.prefix == ' '
+    assert t1.string == '"\n'
+    assert endmarker.string == ''
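A usage note on why the prefix placement matters: parso guarantees that a
parsed tree reproduces its source exactly, and that round trip goes through
prefixes. A minimal check against parso's public API (behavior as asserted by
test_error_string above):

    import parso

    code = ' "\n'
    module = parso.parse(code)
    # Prefix plus token string round-trip the source, error leaf included.
    assert module.get_code() == code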