Mirror of https://github.com/davidhalter/parso.git
Fix tokenizer for random invalid unicode points
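Before this change, an unmatched character made the tokenizer emit its ERRORTOKEN immediately, before the newline and indentation bookkeeping ran, so an indented line starting with an invalid code point never produced the surrounding INDENT/DEDENT tokens. The diff below defers that error branch until after the indentation handling. A minimal reproduction sketch, assuming a parso checkout that contains this commit (parso.parse and get_code are public API; the exact error-recovery tree shape is deliberately not asserted):

import parso

code = ' \x00a'  # indented line that starts with a stray NUL character

module = parso.parse(code)
# Error recovery keeps the invalid character as an error leaf, and with the
# fixed indentation bookkeeping the tree still round-trips exactly:
assert module.get_code() == code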
@@ -496,33 +496,25 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 pseudomatch = pseudo_token.match(string_line, pos)
             else:
                 pseudomatch = pseudo_token.match(line, pos)
-            if not pseudomatch:  # scan for tokens
-                match = whitespace.match(line, pos)
-                if pos == 0 and paren_level == 0:
-                    for t in dedent_if_necessary(match.end()):
-                        yield t
-                pos = match.end()
-                new_line = False
-                yield PythonToken(
-                    ERRORTOKEN, line[pos], (lnum, pos),
-                    additional_prefix + match.group(0)
-                )
-                additional_prefix = ''
-                pos += 1
-                continue
 
-            prefix = additional_prefix + pseudomatch.group(1)
-            additional_prefix = ''
-            start, pos = pseudomatch.span(2)
-            spos = (lnum, start)
-            token = pseudomatch.group(2)
-            if token == '':
-                assert prefix
-                additional_prefix = prefix
-                # This means that we have a line with whitespace/comments at
-                # the end, which just results in an endmarker.
-                break
-            initial = token[0]
+            if pseudomatch:
+                prefix = additional_prefix + pseudomatch.group(1)
+                additional_prefix = ''
+                start, pos = pseudomatch.span(2)
+                spos = (lnum, start)
+                token = pseudomatch.group(2)
+                if token == '':
+                    assert prefix
+                    additional_prefix = prefix
+                    # This means that we have a line with whitespace/comments at
+                    # the end, which just results in an endmarker.
+                    break
+                initial = token[0]
+            else:
+                match = whitespace.match(line, pos)
+                initial = line[match.end()]
+                start = match.end()
+                spos = (lnum, start)
 
             if new_line and initial not in '\r\n\\#':
                 new_line = False
@@ -539,8 +531,23 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     for t in dedent_if_necessary(indent_start):
                         yield t
 
-            if (initial in numchars or  # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
+            if not pseudomatch:  # scan for tokens
+                match = whitespace.match(line, pos)
+                if pos == 0 and paren_level == 0:
+                    for t in dedent_if_necessary(match.end()):
+                        yield t
+                pos = match.end()
+                new_line = False
+                yield PythonToken(
+                    ERRORTOKEN, line[pos], (lnum, pos),
+                    additional_prefix + match.group(0)
+                )
+                additional_prefix = ''
+                pos += 1
+                continue
+
+            if (initial in numchars  # ordinary number
+                    or (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif pseudomatch.group(3) is not None:  # ordinary name
                 if token in always_break_tokens:
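Taken together, the two hunks above compute initial, start and spos even when the pseudo-token regex fails to match, so the new_line indentation block runs before the ERRORTOKEN is emitted. A toy illustration of why that ordering matters; this is deliberately not parso code, and the pattern and token names are invented for the sketch:

import re

WS = re.compile(r'[ \t]*')

def toy_tokenize(lines):
    indents = [0]  # stack of indentation columns, as in a Python tokenizer
    for line in lines:
        col = WS.match(line).end()   # column of the first real character
        if col > indents[-1]:        # indentation grew -> INDENT
            indents.append(col)
            yield ('INDENT', '')
        while col < indents[-1]:     # indentation shrank -> DEDENT
            indents.pop()
            yield ('DEDENT', '')
        for char in line[col:].rstrip('\n'):
            # The indent/dedent decision already happened above, so an
            # unmatchable character only costs a single ERRORTOKEN.
            kind = 'NAME' if char.isidentifier() else 'ERRORTOKEN'
            yield (kind, char)
    while len(indents) > 1:          # close any open blocks at end of input
        indents.pop()
        yield ('DEDENT', '')

print(list(toy_tokenize([' \x00a\n'])))
# [('INDENT', ''), ('ERRORTOKEN', '\x00'), ('NAME', 'a'), ('DEDENT', '')]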
@@ -258,7 +258,7 @@ def test_token_types(code, types):
 
 
 def test_error_string():
-    t1, newline, endmarker = _get_token_list(' "\n')
+    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
     assert t1.type == ERRORTOKEN
     assert t1.prefix == ' '
     assert t1.string == '"'
@@ -339,6 +339,8 @@ def test_backslash():
 
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
+        # f-strings
         ('f"', [FSTRING_START]),
         ('f""', [FSTRING_START, FSTRING_END]),
         ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
@@ -394,7 +396,7 @@ def test_backslash():
         ]),
     ]
 )
-def test_fstring(code, types, version_ge_py36):
+def test_token_types(code, types, version_ge_py36):
     actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
     assert types + [ENDMARKER] == actual_types
 
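The test updates mirror the same behavior shift: the leading space in ' "\n' and ' \x00a' now yields an INDENT/DEDENT pair around the error token, and the formerly f-string-only parametrized test absorbs the new case under the broader name test_token_types. A hedged interactive check against the internal tokenizer; the module path and call shape match this era of the codebase and may differ in other releases:

from parso.python.tokenize import tokenize
from parso.utils import parse_version_string

tokens = list(tokenize(' \x00a', parse_version_string('3.6')))
print([t.type for t in tokens])
# Expected per the new parametrized case: INDENT, ERRORTOKEN, NAME, DEDENT,
# followed by the implicit ENDMARKER.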