Fixed name tokenizing issues for tamil characters, fixes davidhalter/jedi#1368

This commit is contained in:
Dave Halter
2019-07-12 21:31:49 +02:00
parent 19de3eb5ca
commit 9501b0bde0
2 changed files with 74 additions and 16 deletions

View File

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
from parso.utils import split_lines
# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
    Whitespace = r'[ \f\t]*'
    whitespace = _compile(Whitespace)
    Comment = r'#[^\r\n]*'
Name = r'\w+' # Python 2 is pretty much not working properly anymore, we just ignore
# parsing unicode properly, which is fine, I guess.
if version_info[0] < 3 or sys.version_info[0] == 2:
Name = r'(\w+)'
else:
Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
    if version_info >= (3, 6):
        Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
            if (initial in numchars or  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                yield PythonToken(NUMBER, token, spos, prefix)
elif pseudomatch.group(3) is not None: # ordinary name
if token in always_break_tokens:
fstring_stack[:] = []
paren_level = 0
# We only want to dedent if the token is on a new line.
if re.match(r'[ \f\t]*$', line[:start]):
while True:
indent = indents.pop()
if indent > start:
yield PythonToken(DEDENT, '', spos, '')
else:
indents.append(indent)
break
if is_identifier(token):
yield PythonToken(NAME, token, spos, prefix)
else:
for t in _split_illegal_unicode_name(token, spos, prefix):
yield t # yield from Python 2
            elif initial in '\r\n':
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
            elif token in fstring_pattern_map:  # The start of an fstring.
                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                yield PythonToken(FSTRING_START, token, spos, prefix)
elif is_identifier(initial): # ordinary name
if token in always_break_tokens:
fstring_stack[:] = []
paren_level = 0
# We only want to dedent if the token is on a new line.
if re.match(r'[ \f\t]*$', line[:start]):
while True:
indent = indents.pop()
if indent > start:
yield PythonToken(DEDENT, '', spos, '')
else:
indents.append(indent)
break
yield PythonToken(NAME, token, spos, prefix)
            elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
def _split_illegal_unicode_name(token, start_pos, prefix):
    """Split a name token containing illegal unicode characters into pieces.

    Yields alternating ``NAME`` tokens (maximal runs that form a valid
    identifier) and ``ERRORTOKEN`` tokens (runs of characters that are not
    valid in an identifier), covering *token* completely and in order.

    :param token: The matched name string that failed ``is_identifier``.
    :param start_pos: ``(row, column)`` of the first character of *token*.
    :param prefix: The whitespace/comment prefix belonging to *token*;
        attached to the first yielded piece only.
    """
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ''
    is_illegal = False
    # Start position of the *current* run. Using ``start_pos[1] + i`` at
    # yield time would be wrong: ``i`` then indexes the character after
    # the run, not its first character.
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if is_identifier(char):
                yield create_token()
                # Only the first piece owns the original prefix; later
                # pieces start exactly where the previous one ended.
                prefix = ''
                pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = False
            else:
                found += char
        else:
            new_found = found + char
            if is_identifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ''
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        path = sys.argv[1]

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8    # This file contains Unicode characters.
import sys
from textwrap import dedent
import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
    check('a\\')
xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
@pytest.mark.parametrize(
    ('code', 'types'), [
# Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                               NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
# Name stuff
('1foo1', [NUMBER, NAME]),
pytest.param(
u'மெல்லினம்', [NAME],
**xfail_py2),
pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]