diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py
index 9bdf0c5..48d128b 100644
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -54,8 +54,13 @@ if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
 else:
-    namechars = string.ascii_letters + '_'
-    is_identifier = lambda s: s in namechars
+    # Python 2 doesn't, but it's not that important anymore and if you tokenize
+    # Python 2 code with this, it's still ok. It's just that parsing Python 3
+    # code with this function is not 100% correct.
+    # This just means that Python 2 code matches a few identifiers too much,
+    # but that doesn't really matter.
+    def is_identifier(s):
+        return True
 
 
 def group(*choices, **kwargs):
@@ -135,7 +140,11 @@ def _create_token_collection(version_info):
     Comment = r'#[^\r\n]*'
     # Python 2 is pretty much not working properly anymore, we just ignore
     # parsing unicode properly, which is fine, I guess.
-    if version_info[0] < 3 or sys.version_info[0] == 2:
+    if version_info[0] == 2:
+        Name = r'([A-Za-z_0-9]+)'
+    elif sys.version_info[0] == 2:
+        # Unfortunately the regex engine cannot deal with the regex below, so
+        # just use this one.
         Name = r'(\w+)'
     else:
         Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
diff --git a/test/test_diff_parser.py b/test/test_diff_parser.py
index 99c9327..04e1ca0 100644
--- a/test/test_diff_parser.py
+++ b/test/test_diff_parser.py
@@ -974,10 +974,12 @@ def test_random_unicode_characters(differ):
     Those issues were all found with the fuzzer.
     """
     differ.initialize('')
-    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True)
+    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1,
+                 expect_error_leaves=True)
     differ.parse(u'\r\r', parsers=1)
     differ.parse(u"˟Ę\x05À\r rúƣ@\x8a\x15r()\n", parsers=1, expect_error_leaves=True)
-    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1)
+    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1,
+                 expect_error_leaves=sys.version_info[0] == 2)
     s = ' if not (self, "_fi\x02\x0e\x08\n\nle"):'
     differ.parse(s, parsers=1, expect_error_leaves=True)
     differ.parse('')
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index 8d8f272..bf703dc 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -142,7 +142,7 @@ def test_identifier_contains_unicode():
     else:
         # Unicode tokens in Python 2 seem to be identified as operators.
         # They will be ignored in the parser, that's ok.
-        assert unicode_token[0] == OP
+        assert unicode_token[0] == ERRORTOKEN


 def test_quoted_strings():