Fix name tokenizing for Python 2

2025-12-06 21:04:29 +08:00 · 2019-07-13 15:34:23 +02:00
parent 2b8544021f
commit 0a5b5f3346
3 changed files with 17 additions and 6 deletions
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -54,8 +54,13 @@ if py_version >= 30:
    # Python 3 has str.isidentifier() to check if a char is a valid identifier
    is_identifier = str.isidentifier
 else:
-    namechars = string.ascii_letters + '_'
-    is_identifier = lambda s: s in namechars
+    # Python 2 has no str.isidentifier(). That is not very important anymore:
+    # tokenizing Python 2 code with this fallback is still ok; it is only
+    # parsing Python 3 code with this function that is not 100% correct.
+    # In practice this fallback just accepts a few strings that are not
+    # valid identifiers, but that doesn't really matter.
+    def is_identifier(s):
+        return True


 def group(*choices, **kwargs):
@@ -135,7 +140,11 @@ def _create_token_collection(version_info):
    Comment = r'#[^\r\n]*'
    # Python 2 is pretty much not working properly anymore, we just ignore
    # parsing unicode properly, which is fine, I guess.
-    if version_info[0] < 3 or sys.version_info[0] == 2:
+    if version_info[0] == 2:
+        Name = r'([A-Za-z_0-9]+)'
+    elif sys.version_info[0] == 2:
+        # Unfortunately the regex engine cannot deal with the regex below, so
+        # just use this one.
        Name = r'(\w+)'
    else:
        Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
--- a/test/test_diff_parser.py
+++ b/test/test_diff_parser.py
@@ -974,10 +974,12 @@ def test_random_unicode_characters(differ):
    Those issues were all found with the fuzzer.
    """
    differ.initialize('')
-    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True)
+    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1,
+                 expect_error_leaves=True)
    differ.parse(u'\r\r', parsers=1)
    differ.parse(u"˟Ę\x05À\r   rúƣ@\x8a\x15r()\n", parsers=1, expect_error_leaves=True)
-    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1)
+    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1,
+                 expect_error_leaves=sys.version_info[0] == 2)
    s = '        if not (self, "_fi\x02\x0e\x08\n\nle"):'
    differ.parse(s, parsers=1, expect_error_leaves=True)
    differ.parse('')
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -142,7 +142,7 @@ def test_identifier_contains_unicode():
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, that's ok.
-        assert unicode_token[0] == OP
+        assert unicode_token[0] == ERRORTOKEN


 def test_quoted_strings():