Fixed name tokenizing issues for tamil characters, fixes davidhalter/jedi#1368

This commit is contained in:
Dave Halter
2019-07-12 21:31:49 +02:00
parent 19de3eb5ca
commit 9501b0bde0
2 changed files with 74 additions and 16 deletions

View File

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
from parso.utils import split_lines
# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
    Whitespace = r'[ \f\t]*'
    whitespace = _compile(Whitespace)
    Comment = r'#[^\r\n]*'
Name = r'\w+' # Python 2 is pretty much not working properly anymore, we just ignore
# parsing unicode properly, which is fine, I guess.
if version_info[0] < 3 or sys.version_info[0] == 2:
Name = r'(\w+)'
else:
Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
    if version_info >= (3, 6):
        Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
            if (initial in numchars or  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                yield PythonToken(NUMBER, token, spos, prefix)
elif pseudomatch.group(3) is not None: # ordinary name
if token in always_break_tokens:
fstring_stack[:] = []
paren_level = 0
# We only want to dedent if the token is on a new line.
if re.match(r'[ \f\t]*$', line[:start]):
while True:
indent = indents.pop()
if indent > start:
yield PythonToken(DEDENT, '', spos, '')
else:
indents.append(indent)
break
if is_identifier(token):
yield PythonToken(NAME, token, spos, prefix)
else:
for t in _split_illegal_unicode_name(token, spos, prefix):
yield t # yield from Python 2
            elif initial in '\r\n':
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
            elif token in fstring_pattern_map:  # The start of an fstring.
                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                yield PythonToken(FSTRING_START, token, spos, prefix)
elif is_identifier(initial): # ordinary name
if token in always_break_tokens:
fstring_stack[:] = []
paren_level = 0
# We only want to dedent if the token is on a new line.
if re.match(r'[ \f\t]*$', line[:start]):
while True:
indent = indents.pop()
if indent > start:
yield PythonToken(DEDENT, '', spos, '')
else:
indents.append(indent)
break
yield PythonToken(NAME, token, spos, prefix)
            elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
def _split_illegal_unicode_name(token, start_pos, prefix):
    """Split a name token containing illegal unicode characters into pieces.

    Yields alternating ``NAME`` tokens (maximal runs that form a valid
    identifier) and ``ERRORTOKEN`` tokens (runs of characters that are not
    valid in an identifier), covering *token* completely and in order.

    :param token: The matched name string that failed ``is_identifier``.
    :param start_pos: ``(row, column)`` of the first character of *token*.
    :param prefix: The whitespace/comment prefix belonging to *token*;
        attached to the first yielded piece only.
    """
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ''
    is_illegal = False
    # Start position of the *current* run. Using ``start_pos[1] + i`` at
    # yield time would be wrong: ``i`` then indexes the character after
    # the run, not its first character.
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if is_identifier(char):
                yield create_token()
                # Only the first piece owns the original prefix; later
                # pieces start exactly where the previous one ended.
                prefix = ''
                pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = False
            else:
                found += char
        else:
            new_found = found + char
            if is_identifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ''
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        path = sys.argv[1]

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8    # This file contains Unicode characters.
import sys
from textwrap import dedent
import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
    check('a\\')
xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
@pytest.mark.parametrize(
    ('code', 'types'), [
# Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                               NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
# Name stuff
('1foo1', [NUMBER, NAME]),
pytest.param(
u'மெல்லினம்', [NAME],
**xfail_py2),
pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]