Fix name tokenizing for Python 2

This commit is contained in:
Dave Halter
2019-07-13 15:34:23 +02:00
parent 2b8544021f
commit 0a5b5f3346
3 changed files with 17 additions and 6 deletions

View File

@@ -54,8 +54,13 @@ if py_version >= 30:
     # Python 3 has str.isidentifier() to check if a char is a valid identifier
     is_identifier = str.isidentifier
 else:
-    namechars = string.ascii_letters + '_'
-    is_identifier = lambda s: s in namechars
+    # Python 2 doesn't, but it's not that important anymore and if you tokenize
+    # Python 2 code with this, it's still ok. It's just that parsing Python 3
+    # code with this function is not 100% correct.
+    # This just means that Python 2 code matches a few identifiers too much,
+    # but that doesn't really matter.
+    def is_identifier(s):
+        return True


 def group(*choices, **kwargs):
@@ -135,7 +140,11 @@ def _create_token_collection(version_info):
     Comment = r'#[^\r\n]*'
     # Python 2 is pretty much not working properly anymore, we just ignore
     # parsing unicode properly, which is fine, I guess.
-    if version_info[0] < 3 or sys.version_info[0] == 2:
+    if version_info[0] == 2:
+        Name = r'([A-Za-z_0-9]+)'
+    elif sys.version_info[0] == 2:
+        # Unfortunately the regex engine cannot deal with the regex below, so
+        # just use this one.
         Name = r'(\w+)'
     else:
         Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'

View File

@@ -974,10 +974,12 @@ def test_random_unicode_characters(differ):
     Those issues were all found with the fuzzer.
     """
     differ.initialize('')
-    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1, expect_error_leaves=True)
+    differ.parse(u'\x1dĔBϞɛˁşʑ˳˻ȣſéÎ\x90̕ȟòwʘ\x1dĔBϞɛˁşʑ˳˻ȣſéÎ', parsers=1,
+                 expect_error_leaves=True)
     differ.parse(u'\r\r', parsers=1)
     differ.parse(u"˟Ę\x05À\r rúƣ@\x8a\x15r()\n", parsers=1, expect_error_leaves=True)
-    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1)
+    differ.parse(u'a\ntaǁ\rGĒōns__\n\nb', parsers=1,
+                 expect_error_leaves=sys.version_info[0] == 2)
     s = ' if not (self, "_fi\x02\x0e\x08\n\nle"):'
     differ.parse(s, parsers=1, expect_error_leaves=True)
     differ.parse('')

View File

@@ -142,7 +142,7 @@ def test_identifier_contains_unicode():
     else:
         # Unicode tokens in Python 2 seem to be identified as operators.
         # They will be ignored in the parser, that's ok.
-        assert unicode_token[0] == OP
+        assert unicode_token[0] == ERRORTOKEN


 def test_quoted_strings():