Mirror of https://github.com/davidhalter/parso.git, synced 2025-12-22 04:01:32 +08:00
Fixed name tokenizing issues for tamil characters, fixes davidhalter/jedi#1368
parso/python/tokenize.py

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
 from parso.utils import split_lines
 
 
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
+
 STRING = PythonTokenTypes.STRING
 NAME = PythonTokenTypes.NAME
 NUMBER = PythonTokenTypes.NUMBER
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
     Whitespace = r'[ \f\t]*'
     whitespace = _compile(Whitespace)
     Comment = r'#[^\r\n]*'
-    Name = r'\w+'
+    # Python 2 is pretty much not working properly anymore, we just ignore
+    # parsing unicode properly, which is fine, I guess.
+    if version_info[0] < 3 or sys.version_info[0] == 2:
+        Name = r'(\w+)'
+    else:
+        Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
 
     if version_info >= (3, 6):
         Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
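The Python 3 branch of this pattern deliberately over-matches: it accepts every code point from U+0080 upward, including characters such as '²' that are not legal in identifiers; the new _split_illegal_unicode_name helper further down repairs those over-matches. A minimal sketch (not part of the commit; name_pattern is an illustrative name) checking that premise:

import re

# Assumption: this mirrors the Python 3 Name pattern built above.
MAX_UNICODE = '\U0010ffff'
name_pattern = re.compile(u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)')

# The regex happily consumes '²' as part of a name run...
assert name_pattern.match(u'ä²ö').group(1) == u'ä²ö'
# ...even though '²' is not a valid identifier character in Python 3.
assert not u'²'.isidentifier()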
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
+            elif pseudomatch.group(3) is not None:  # ordinary name
+                if token in always_break_tokens:
+                    fstring_stack[:] = []
+                    paren_level = 0
+                    # We only want to dedent if the token is on a new line.
+                    if re.match(r'[ \f\t]*$', line[:start]):
+                        while True:
+                            indent = indents.pop()
+                            if indent > start:
+                                yield PythonToken(DEDENT, '', spos, '')
+                            else:
+                                indents.append(indent)
+                                break
+                if is_identifier(token):
+                    yield PythonToken(NAME, token, spos, prefix)
+                else:
+                    for t in _split_illegal_unicode_name(token, spos, prefix):
+                        yield t  # yield from Python 2
             elif initial in '\r\n':
                 if any(not f.allow_multiline() for f in fstring_stack):
                     # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             elif token in fstring_pattern_map:  # The start of an fstring.
                 fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
-            elif is_identifier(initial):  # ordinary name
-                if token in always_break_tokens:
-                    fstring_stack[:] = []
-                    paren_level = 0
-                    # We only want to dedent if the token is on a new line.
-                    if re.match(r'[ \f\t]*$', line[:start]):
-                        while True:
-                            indent = indents.pop()
-                            if indent > start:
-                                yield PythonToken(DEDENT, '', spos, '')
-                            else:
-                                indents.append(indent)
-                                break
-                yield PythonToken(NAME, token, spos, prefix)
             elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                 additional_prefix += prefix + line[start:]
                 break
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
 
 
+def _split_illegal_unicode_name(token, start_pos, prefix):
+    def create_token():
+        return PythonToken(
+            ERRORTOKEN if is_illegal else NAME, found,
+            (start_pos[0], start_pos[1] + i), prefix
+        )
+
+    found = ''
+    is_illegal = False
+    for i, char in enumerate(token):
+        if is_illegal:
+            if is_identifier(char):
+                yield create_token()
+                found = char
+                is_illegal = False
+            else:
+                found += char
+        else:
+            new_found = found + char
+            if is_identifier(new_found):
+                found = new_found
+            else:
+                if found:
+                    yield create_token()
+                found = char
+                is_illegal = True
+
+    if found:
+        yield create_token()
+
+
 if __name__ == "__main__":
     if len(sys.argv) >= 2:
         path = sys.argv[1]
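To see what _split_illegal_unicode_name does with a mixed token, here is a standalone sketch of the same run-splitting idea. It uses str.isidentifier as a stand-in for parso's is_identifier, so it only mirrors the Python 3 behaviour, and it returns (text, is_illegal) runs instead of positioned PythonTokens:

def split_runs(token):
    """Group a regex-matched name into legal/illegal runs, like the helper above."""
    runs = []  # (text, is_illegal) pairs
    found = ''
    is_illegal = False
    for char in token:
        if is_illegal:
            if char.isidentifier():
                # An illegal run ends as soon as a legal character starts.
                runs.append((found, True))
                found = char
                is_illegal = False
            else:
                found += char
        else:
            if (found + char).isidentifier():
                # Still a valid identifier prefix, keep extending the run.
                found = found + char
            else:
                if found:
                    runs.append((found, False))
                found = char
                is_illegal = True
    if found:
        runs.append((found, is_illegal))
    return runs

print(split_runs(u'ä²ö'))     # [('ä', False), ('²', True), ('ö', False)]
print(split_runs(u'ää²¹öö'))  # [('ää', False), ('²¹', True), ('öö', False)]

In the real helper this splitting only ever runs on tokens that the Name regex matched but is_identifier rejected, and each run comes back out as a NAME or ERRORTOKEN with its original position and prefix preserved.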
test/test_tokenize.py

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-    # This file contains Unicode characters.
 
+import sys
 from textwrap import dedent
 
 import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
 
 # To make it easier to access some of the token types, just put them here.
 NAME = PythonTokenTypes.NAME
+NUMBER = PythonTokenTypes.NUMBER
 NEWLINE = PythonTokenTypes.NEWLINE
 STRING = PythonTokenTypes.STRING
 INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
     check('a\\')
 
 
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
+
+
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        # Indentation
         (' foo', [INDENT, NAME, DEDENT]),
         (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+
+        # Name stuff
+        ('1foo1', [NUMBER, NAME]),
+        pytest.param(
+            u'மெல்லினம்', [NAME],
+            **xfail_py2),
+        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
     ]
 )
-def test_indentation(code, types):
+def test_token_types(code, types):
     actual_types = [t.type for t in _get_token_list(code)]
     assert actual_types == types + [ENDMARKER]
 
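The new test cases translate directly into tokenizer behaviour. A usage sketch, assuming a parso checkout that includes this commit, where tokenize(code, version_info) is the module-level entry point patched above and parse_version_string comes from parso.utils:

from parso.python.tokenize import tokenize
from parso.utils import parse_version_string

version_info = parse_version_string('3.6')

# A Tamil name now comes back as a single NAME token (plus ENDMARKER).
for token in tokenize(u'மெல்லினம்', version_info):
    print(token.type, repr(token.string))

# Mixed legal/illegal characters are split: NAME, ERRORTOKEN, NAME, ENDMARKER.
for token in tokenize(u'ä²ö', version_info):
    print(token.type, repr(token.string))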