Fix tokenizer for random invalid unicode points

Dave Halter
2020-03-28 21:01:57 +01:00
parent 38b7763e9a
commit a950b82066
2 changed files with 39 additions and 30 deletions

View File

@@ -496,33 +496,25 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 pseudomatch = pseudo_token.match(string_line, pos)
             else:
                 pseudomatch = pseudo_token.match(line, pos)
-            if not pseudomatch:                             # scan for tokens
-                match = whitespace.match(line, pos)
-                if pos == 0 and paren_level == 0:
-                    for t in dedent_if_necessary(match.end()):
-                        yield t
-                pos = match.end()
-                new_line = False
-                yield PythonToken(
-                    ERRORTOKEN, line[pos], (lnum, pos),
-                    additional_prefix + match.group(0)
-                )
-                additional_prefix = ''
-                pos += 1
-                continue
-
-            prefix = additional_prefix + pseudomatch.group(1)
-            additional_prefix = ''
-            start, pos = pseudomatch.span(2)
-            spos = (lnum, start)
-            token = pseudomatch.group(2)
-            if token == '':
-                assert prefix
-                additional_prefix = prefix
-                # This means that we have a line with whitespace/comments at
-                # the end, which just results in an endmarker.
-                break
-            initial = token[0]
+            if pseudomatch:
+                prefix = additional_prefix + pseudomatch.group(1)
+                additional_prefix = ''
+                start, pos = pseudomatch.span(2)
+                spos = (lnum, start)
+                token = pseudomatch.group(2)
+
+                if token == '':
+                    assert prefix
+                    additional_prefix = prefix
+                    # This means that we have a line with whitespace/comments at
+                    # the end, which just results in an endmarker.
+                    break
+                initial = token[0]
+            else:
+                match = whitespace.match(line, pos)
+                initial = line[match.end()]
+                start = match.end()
+                spos = (lnum, start)
 
             if new_line and initial not in '\r\n\\#':
                 new_line = False
@@ -539,8 +531,23 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     for t in dedent_if_necessary(indent_start):
                         yield t
 
-            if (initial in numchars or  # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
+            if not pseudomatch:                             # scan for tokens
+                match = whitespace.match(line, pos)
+                if pos == 0 and paren_level == 0:
+                    for t in dedent_if_necessary(match.end()):
+                        yield t
+                pos = match.end()
+                new_line = False
+                yield PythonToken(
+                    ERRORTOKEN, line[pos], (lnum, pos),
+                    additional_prefix + match.group(0)
+                )
+                additional_prefix = ''
+                pos += 1
+                continue
+
+            if (initial in numchars  # ordinary number
+                    or (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif pseudomatch.group(3) is not None:            # ordinary name
                 if token in always_break_tokens:
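
The net effect of the reordering above: when a line contains a character that the pseudo-token regex cannot match (a stray NUL byte or some other invalid code point), the indentation bookkeeping now runs before that character is emitted as an ERRORTOKEN, instead of short-circuiting at the top of the loop. A minimal sketch of how to observe this, assuming parso's internal tokenize() helper still takes a code string plus a version_info tuple; this is internal API, so the exact import path and signature may differ between parso releases:

    # Hedged sketch -- parso internals, not a stable public interface.
    from parso.python.tokenize import tokenize
    from parso.utils import parse_version_string

    for token in tokenize(' \x00a', parse_version_string('3.8')):
        print(token.type, repr(token.string), token.start_pos, repr(token.prefix))
    # With this commit, the expected sequence (see the new test case below) is:
    # INDENT, ERRORTOKEN '\x00', NAME 'a', DEDENT, ENDMARKER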

View File

@@ -258,7 +258,7 @@ def test_token_types(code, types):
 
 
 def test_error_string():
-    t1, newline, endmarker = _get_token_list(' "\n')
+    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
     assert t1.type == ERRORTOKEN
     assert t1.prefix == ' '
     assert t1.string == '"'
@@ -339,6 +339,8 @@ def test_backslash():
 
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
+        # f-strings
         ('f"', [FSTRING_START]),
         ('f""', [FSTRING_START, FSTRING_END]),
         ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
@@ -394,7 +396,7 @@ def test_backslash():
         ]),
     ]
 )
-def test_fstring(code, types, version_ge_py36):
+def test_token_types(code, types, version_ge_py36):
     actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
     assert types + [ENDMARKER] == actual_types
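
As a cross-check outside the tokenizer tests, the same input can be run through parso's stable public API with error recovery. This is a hedged sketch; the exact layout of the recovered error nodes depends on the parso version, but parso's usual guarantee is that get_code() reproduces the input exactly, even for code containing errors:

    import parso

    # The invalid character ends up inside an error leaf, and the original
    # source is still reproduced exactly by the tree.
    module = parso.parse(' \x00a')
    assert module.get_code() == ' \x00a'
    print([child.type for child in module.children])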