Mirror of https://github.com/davidhalter/parso.git, synced 2025-12-22 04:01:32 +08:00
Fixed name tokenizing issues for tamil characters, fixes davidhalter/jedi#1368
parso/python/tokenize.py

@@ -23,6 +23,9 @@ from parso._compatibility import py_version
 from parso.utils import split_lines
 
 
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
+
 STRING = PythonTokenTypes.STRING
 NAME = PythonTokenTypes.NAME
 NUMBER = PythonTokenTypes.NUMBER
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
     Whitespace = r'[ \f\t]*'
     whitespace = _compile(Whitespace)
     Comment = r'#[^\r\n]*'
-    Name = r'\w+'
+    # Python 2 is pretty much not working properly anymore, we just ignore
+    # parsing unicode properly, which is fine, I guess.
+    if version_info[0] < 3 or sys.version_info[0] == 2:
+        Name = r'(\w+)'
+    else:
+        Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
 
     if version_info >= (3, 6):
         Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
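The Python 3 branch of this pattern deliberately over-matches: it accepts every code point from U+0080 upward, including characters such as '²' that are not legal in identifiers; the new _split_illegal_unicode_name helper further down repairs those over-matches. A minimal sketch (not part of the commit; name_pattern is an illustrative name) checking that premise:

import re

# Assumption: this mirrors the Python 3 Name pattern built above.
MAX_UNICODE = '\U0010ffff'
name_pattern = re.compile(u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)')

# The regex happily consumes '²' as part of a name run...
assert name_pattern.match(u'ä²ö').group(1) == u'ä²ö'
# ...even though '²' is not a valid identifier character in Python 3.
assert not u'²'.isidentifier()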
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
+            elif pseudomatch.group(3) is not None:  # ordinary name
+                if token in always_break_tokens:
+                    fstring_stack[:] = []
+                    paren_level = 0
+                    # We only want to dedent if the token is on a new line.
+                    if re.match(r'[ \f\t]*$', line[:start]):
+                        while True:
+                            indent = indents.pop()
+                            if indent > start:
+                                yield PythonToken(DEDENT, '', spos, '')
+                            else:
+                                indents.append(indent)
+                                break
+                if is_identifier(token):
+                    yield PythonToken(NAME, token, spos, prefix)
+                else:
+                    for t in _split_illegal_unicode_name(token, spos, prefix):
+                        yield t  # yield from Python 2
             elif initial in '\r\n':
                 if any(not f.allow_multiline() for f in fstring_stack):
                     # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             elif token in fstring_pattern_map:  # The start of an fstring.
                 fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
-            elif is_identifier(initial):  # ordinary name
-                if token in always_break_tokens:
-                    fstring_stack[:] = []
-                    paren_level = 0
-                    # We only want to dedent if the token is on a new line.
-                    if re.match(r'[ \f\t]*$', line[:start]):
-                        while True:
-                            indent = indents.pop()
-                            if indent > start:
-                                yield PythonToken(DEDENT, '', spos, '')
-                            else:
-                                indents.append(indent)
-                                break
-                yield PythonToken(NAME, token, spos, prefix)
             elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                 additional_prefix += prefix + line[start:]
                 break
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
 
 
+def _split_illegal_unicode_name(token, start_pos, prefix):
+    def create_token():
+        return PythonToken(
+            ERRORTOKEN if is_illegal else NAME, found,
+            (start_pos[0], start_pos[1] + i), prefix
+        )
+
+    found = ''
+    is_illegal = False
+    for i, char in enumerate(token):
+        if is_illegal:
+            if is_identifier(char):
+                yield create_token()
+                found = char
+                is_illegal = False
+            else:
+                found += char
+        else:
+            new_found = found + char
+            if is_identifier(new_found):
+                found = new_found
+            else:
+                if found:
+                    yield create_token()
+                found = char
+                is_illegal = True
+
+    if found:
+        yield create_token()
+
+
 if __name__ == "__main__":
     if len(sys.argv) >= 2:
         path = sys.argv[1]
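To see what _split_illegal_unicode_name does with a mixed token, here is a standalone sketch of the same run-splitting idea. It uses str.isidentifier as a stand-in for parso's is_identifier, so it only mirrors the Python 3 behaviour, and it returns (text, is_illegal) runs instead of positioned PythonTokens:

def split_runs(token):
    """Group a regex-matched name into legal/illegal runs, like the helper above."""
    runs = []  # (text, is_illegal) pairs
    found = ''
    is_illegal = False
    for char in token:
        if is_illegal:
            if char.isidentifier():
                # An illegal run ends as soon as a legal character starts.
                runs.append((found, True))
                found = char
                is_illegal = False
            else:
                found += char
        else:
            if (found + char).isidentifier():
                # Still a valid identifier prefix, keep extending the run.
                found = found + char
            else:
                if found:
                    runs.append((found, False))
                found = char
                is_illegal = True
    if found:
        runs.append((found, is_illegal))
    return runs

print(split_runs(u'ä²ö'))     # [('ä', False), ('²', True), ('ö', False)]
print(split_runs(u'ää²¹öö'))  # [('ää', False), ('²¹', True), ('öö', False)]

In the real helper this splitting only ever runs on tokens that the Name regex matched but is_identifier rejected, and each run comes back out as a NAME or ERRORTOKEN with its original position and prefix preserved.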
test/test_tokenize.py

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-    # This file contains Unicode characters.
 
+import sys
 from textwrap import dedent
 
 import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
 
 # To make it easier to access some of the token types, just put them here.
 NAME = PythonTokenTypes.NAME
+NUMBER = PythonTokenTypes.NUMBER
 NEWLINE = PythonTokenTypes.NEWLINE
 STRING = PythonTokenTypes.STRING
 INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
     check('a\\')
 
 
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
+
+
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        # Indentation
         (' foo', [INDENT, NAME, DEDENT]),
         (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+
+        # Name stuff
+        ('1foo1', [NUMBER, NAME]),
+        pytest.param(
+            u'மெல்லினம்', [NAME],
+            **xfail_py2),
+        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
     ]
 )
-def test_indentation(code, types):
+def test_token_types(code, types):
     actual_types = [t.type for t in _get_token_list(code)]
     assert actual_types == types + [ENDMARKER]
 
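The new test cases translate directly into tokenizer behaviour. A usage sketch, assuming a parso checkout that includes this commit, where tokenize(code, version_info) is the module-level entry point patched above and parse_version_string comes from parso.utils:

from parso.python.tokenize import tokenize
from parso.utils import parse_version_string

version_info = parse_version_string('3.6')

# A Tamil name now comes back as a single NAME token (plus ENDMARKER).
for token in tokenize(u'மெல்லினம்', version_info):
    print(token.type, repr(token.string))

# Mixed legal/illegal characters are split: NAME, ERRORTOKEN, NAME, ENDMARKER.
for token in tokenize(u'ä²ö', version_info):
    print(token.type, repr(token.string))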