diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py
index 5b70d94..510cdb3 100644
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -23,6 +23,9 @@
 from parso._compatibility import py_version
 from parso.utils import split_lines
 
+# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
+MAX_UNICODE = '\U0010ffff'
+
 STRING = PythonTokenTypes.STRING
 NAME = PythonTokenTypes.NAME
 NUMBER = PythonTokenTypes.NUMBER
@@ -130,7 +133,12 @@ def _create_token_collection(version_info):
     Whitespace = r'[ \f\t]*'
     whitespace = _compile(Whitespace)
     Comment = r'#[^\r\n]*'
-    Name = r'\w+'
+    # Python 2 is pretty much not working properly anymore, we just ignore
+    # parsing unicode properly, which is fine, I guess.
+    if version_info[0] < 3 or sys.version_info[0] == 2:
+        Name = r'(\w+)'
+    else:
+        Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
 
     if version_info >= (3, 6):
         Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
@@ -510,6 +518,24 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             if (initial in numchars or  # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
+            elif pseudomatch.group(3) is not None:  # ordinary name
+                if token in always_break_tokens:
+                    fstring_stack[:] = []
+                    paren_level = 0
+                    # We only want to dedent if the token is on a new line.
+                    if re.match(r'[ \f\t]*$', line[:start]):
+                        while True:
+                            indent = indents.pop()
+                            if indent > start:
+                                yield PythonToken(DEDENT, '', spos, '')
+                            else:
+                                indents.append(indent)
+                                break
+                if is_identifier(token):
+                    yield PythonToken(NAME, token, spos, prefix)
+                else:
+                    for t in _split_illegal_unicode_name(token, spos, prefix):
+                        yield t  # yield from Python 2
             elif initial in '\r\n':
                 if any(not f.allow_multiline() for f in fstring_stack):
                     # Would use fstring_stack.clear, but that's not available
@@ -564,20 +590,6 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
             elif token in fstring_pattern_map:  # The start of an fstring.
                 fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                 yield PythonToken(FSTRING_START, token, spos, prefix)
-            elif is_identifier(initial):  # ordinary name
-                if token in always_break_tokens:
-                    fstring_stack[:] = []
-                    paren_level = 0
-                    # We only want to dedent if the token is on a new line.
-                    if re.match(r'[ \f\t]*$', line[:start]):
-                        while True:
-                            indent = indents.pop()
-                            if indent > start:
-                                yield PythonToken(DEDENT, '', spos, '')
-                            else:
-                                indents.append(indent)
-                                break
-                yield PythonToken(NAME, token, spos, prefix)
             elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                 additional_prefix += prefix + line[start:]
                 break
@@ -613,6 +625,37 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
     yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
 
 
+def _split_illegal_unicode_name(token, start_pos, prefix):
+    def create_token():
+        return PythonToken(
+            ERRORTOKEN if is_illegal else NAME, found,
+            (start_pos[0], start_pos[1] + i), prefix
+        )
+
+    found = ''
+    is_illegal = False
+    for i, char in enumerate(token):
+        if is_illegal:
+            if is_identifier(char):
+                yield create_token()
+                found = char
+                is_illegal = False
+            else:
+                found += char
+        else:
+            new_found = found + char
+            if is_identifier(new_found):
+                found = new_found
+            else:
+                if found:
+                    yield create_token()
+                found = char
+                is_illegal = True
+
+    if found:
+        yield create_token()
+
+
 if __name__ == "__main__":
     if len(sys.argv) >= 2:
         path = sys.argv[1]
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index db96513..7d99a7b 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8
 # This file contains Unicode characters.
+import sys
 from textwrap import dedent
 
 import pytest
@@ -14,6 +15,7 @@ from parso.python.tokenize import PythonToken
 
 # To make it easier to access some of the token types, just put them here.
 NAME = PythonTokenTypes.NAME
+NUMBER = PythonTokenTypes.NUMBER
 NEWLINE = PythonTokenTypes.NEWLINE
 STRING = PythonTokenTypes.STRING
 INDENT = PythonTokenTypes.INDENT
@@ -228,16 +230,29 @@ def test_endmarker_end_pos():
     check('a\\')
 
 
+xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])
+
+
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        # Indentation
         (' foo', [INDENT, NAME, DEDENT]),
         (' foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
         (' foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
+
+        # Name stuff
+        ('1foo1', [NUMBER, NAME]),
+        pytest.param(
+            u'மெல்லினம்', [NAME],
+            **xfail_py2),
+        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
+        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
+        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
     ]
 )
-def test_indentation(code, types):
+def test_token_types(code, types):
     actual_types = [t.type for t in _get_token_list(code)]
     assert actual_types == types + [ENDMARKER]
 
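
A minimal usage sketch of the behavior this patch introduces (not part of the diff itself): a character like `²` is matched by the new, deliberately broad Name regex (it accepts every code point from U+0080 upward) but is not a legal identifier character, so `_split_illegal_unicode_name` yields it as a separate ERRORTOKEN between the surrounding NAME tokens. This assumes the patched parso is importable; `tokenize` and `parse_version_string` are the existing public helpers in `parso.python.tokenize` and `parso.utils`.

    # Sketch only: demonstrates the NAME/ERRORTOKEN split that the new
    # test_token_types cases assert; not part of the patch above.
    from parso.python.tokenize import tokenize
    from parso.utils import parse_version_string

    for token in tokenize(u'ä²ö', parse_version_string('3.6')):
        print(token.type, repr(token.string))
    # Token types, per the new tests:
    # NAME 'ä', then ERRORTOKEN '²', then NAME 'ö', then ENDMARKER ''.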