Mirror of https://github.com/davidhalter/parso.git (synced 2025-12-21 20:01:21 +08:00)
Fixed name tokenizing issues for Tamil characters, fixes davidhalter/jedi#1368
The change touches the tokenizer module (parso.python.tokenize) and its test suite.
@@ -23,6 +23,9 @@ from parso._compatibility import py_version
 from parso.utils import split_lines
 
 
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
+
 STRING = PythonTokenTypes.STRING
 NAME = PythonTokenTypes.NAME
 NUMBER = PythonTokenTypes.NUMBER
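For context, a quick sanity check (mine, not part of the commit) that the new constant really is the highest Unicode code point:

    # Illustration only: 0x10ffff is 1,114,111, the last Unicode code point.
    MAX_UNICODE = '\U0010ffff'
    assert ord(MAX_UNICODE) == 0x10FFFF == 1114111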
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
     Whitespace = r'[ \f\t]*'
     whitespace = _compile(Whitespace)
     Comment = r'#[^\r\n]*'
-    Name = r'\w+'
+    # Python 2 is pretty much not working properly anymore, we just ignore
+    # parsing unicode properly, which is fine, I guess.
+    if version_info[0] < 3 or sys.version_info[0] == 2:
+        Name = r'(\w+)'
+    else:
+        Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
 
     if version_info >= (3, 6):
         Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
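A short illustration of my own (not part of the commit) of what the broadened Name pattern does on Python 3: every code point from U+0080 up to MAX_UNICODE is allowed inside a candidate name, so a Tamil word, combining marks included, is captured as one run instead of being cut apart the way the old \w+ pattern cut it. The pattern is deliberately more permissive than the identifier rules; validity is re-checked later.

    import re

    # Illustration only: the broadened Name pattern from the diff above.
    MAX_UNICODE = '\U0010ffff'
    Name = '([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'

    # A Tamil identifier is captured as a single run, combining marks included.
    assert re.match(Name, 'மெல்லினம்').group(1) == 'மெல்லினம்'

    # Intentionally permissive: '²' is not a legal identifier character, but it
    # still matches here; the identifier check happens later in the tokenizer.
    assert re.match(Name, 'ä²ö').group(1) == 'ä²ö'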
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if (initial in numchars or                      # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
+            elif pseudomatch.group(3) is not None:  # ordinary name
+                if token in always_break_tokens:
+                    fstring_stack[:] = []
+                    paren_level = 0
+                    # We only want to dedent if the token is on a new line.
+                    if re.match(r'[ \f\t]*$', line[:start]):
+                        while True:
+                            indent = indents.pop()
+                            if indent > start:
+                                yield PythonToken(DEDENT, '', spos, '')
+                            else:
+                                indents.append(indent)
+                                break
+                if is_identifier(token):
+                    yield PythonToken(NAME, token, spos, prefix)
+                else:
+                    for t in _split_illegal_unicode_name(token, spos, prefix):
+                        yield t  # yield from Python 2
             elif initial in '\r\n':
                 if any(not f.allow_multiline() for f in fstring_stack):
                     # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             elif token in fstring_pattern_map:  # The start of an fstring.
                 fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
-            elif is_identifier(initial):  # ordinary name
-                if token in always_break_tokens:
-                    fstring_stack[:] = []
-                    paren_level = 0
-                    # We only want to dedent if the token is on a new line.
-                    if re.match(r'[ \f\t]*$', line[:start]):
-                        while True:
-                            indent = indents.pop()
-                            if indent > start:
-                                yield PythonToken(DEDENT, '', spos, '')
-                            else:
-                                indents.append(indent)
-                                break
-                yield PythonToken(NAME, token, spos, prefix)
             elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                 additional_prefix += prefix + line[start:]
                 break
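To see why the new NAME branch re-validates the matched run, here is a small sketch of my own, assuming parso's is_identifier() helper boils down to str.isidentifier() on Python 3:

    # Illustration only: the broad regex can capture runs that are not valid
    # identifiers, so the tokenizer checks them before yielding NAME.
    assert 'மெல்லினம்'.isidentifier()   # valid name -> one NAME token
    assert not '²'.isidentifier()        # superscript two -> ERRORTOKEN
    assert not 'ä²ö'.isidentifier()      # mixed run -> _split_illegal_unicode_name()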
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
 
 
+def _split_illegal_unicode_name(token, start_pos, prefix):
+    def create_token():
+        return PythonToken(
+            ERRORTOKEN if is_illegal else NAME, found,
+            (start_pos[0], start_pos[1] + i), prefix
+        )
+
+    found = ''
+    is_illegal = False
+    for i, char in enumerate(token):
+        if is_illegal:
+            if is_identifier(char):
+                yield create_token()
+                found = char
+                is_illegal = False
+            else:
+                found += char
+        else:
+            new_found = found + char
+            if is_identifier(new_found):
+                found = new_found
+            else:
+                if found:
+                    yield create_token()
+                found = char
+                is_illegal = True
+
+    if found:
+        yield create_token()
+
+
 if __name__ == "__main__":
     if len(sys.argv) >= 2:
         path = sys.argv[1]
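The helper turns a mixed run into alternating NAME and ERRORTOKEN pieces. A sketch of the observable behaviour, mirroring how the test helper _get_token_list drives the tokenizer; treat the exact imports and signatures as assumptions about the parso version at this commit:

    from parso.utils import parse_version_string
    from parso.python.tokenize import tokenize

    # Illustration only: 'ä²ö' is no longer one bogus token but three.
    tokens = list(tokenize('ä²ö', parse_version_string('3.6')))
    print([(t.type, t.string) for t in tokens])
    # expected, per the new tests: NAME 'ä', ERRORTOKEN '²', NAME 'ö', then ENDMARKER ''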
In the tokenizer tests:

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 # This file contains Unicode characters.
 
+import sys
 from textwrap import dedent
 
 import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
 
 # To make it easier to access some of the token types, just put them here.
 NAME = PythonTokenTypes.NAME
+NUMBER = PythonTokenTypes.NUMBER
 NEWLINE = PythonTokenTypes.NEWLINE
 STRING = PythonTokenTypes.STRING
 INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
     check('a\\')
 
 
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
+
+
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        # Indentation
         (' foo', [INDENT, NAME, DEDENT]),
         (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+
+        # Name stuff
+        ('1foo1', [NUMBER, NAME]),
+        pytest.param(
+            u'மெல்லினம்', [NAME],
+            **xfail_py2),
+        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
     ]
 )
-def test_indentation(code, types):
+def test_token_types(code, types):
     actual_types = [t.type for t in _get_token_list(code)]
     assert actual_types == types + [ENDMARKER]
 
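As a closing illustration (mine, not part of the commit), the user-visible effect reported in jedi#1368: with a parso build that includes this fix, a Tamil identifier comes back from the public API as an ordinary name leaf instead of an error token.

    import parso

    # Illustration only, assuming a parso version that contains this fix.
    tree = parso.parse('மெல்லினம் = 1')
    leaf = tree.get_first_leaf()
    print(leaf.type, leaf.value)   # expected: name மெல்லினம்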