diff --git a/parso/python/tokenize.py b/parso/python/tokenize.py
index 3dcb6c4..2f87ae4 100644
--- a/parso/python/tokenize.py
+++ b/parso/python/tokenize.py
@@ -496,33 +496,25 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                 pseudomatch = pseudo_token.match(string_line, pos)
             else:
                 pseudomatch = pseudo_token.match(line, pos)

-            if not pseudomatch:                             # scan for tokens
-                match = whitespace.match(line, pos)
-                if pos == 0 and paren_level == 0:
-                    for t in dedent_if_necessary(match.end()):
-                        yield t
-                pos = match.end()
-                new_line = False
-                yield PythonToken(
-                    ERRORTOKEN, line[pos], (lnum, pos),
-                    additional_prefix + match.group(0)
-                )
-                additional_prefix = ''
-                pos += 1
-                continue
-            prefix = additional_prefix + pseudomatch.group(1)
-            additional_prefix = ''
-            start, pos = pseudomatch.span(2)
-            spos = (lnum, start)
-            token = pseudomatch.group(2)
-            if token == '':
-                assert prefix
-                additional_prefix = prefix
-                # This means that we have a line with whitespace/comments at
-                # the end, which just results in an endmarker.
-                break
-            initial = token[0]
+            if pseudomatch:
+                prefix = additional_prefix + pseudomatch.group(1)
+                additional_prefix = ''
+                start, pos = pseudomatch.span(2)
+                spos = (lnum, start)
+                token = pseudomatch.group(2)
+                if token == '':
+                    assert prefix
+                    additional_prefix = prefix
+                    # This means that we have a line with whitespace/comments at
+                    # the end, which just results in an endmarker.
+                    break
+                initial = token[0]
+            else:
+                match = whitespace.match(line, pos)
+                initial = line[match.end()]
+                start = match.end()
+                spos = (lnum, start)

             if new_line and initial not in '\r\n\\#':
                 new_line = False
@@ -539,8 +531,23 @@ def tokenize_lines(lines, version_info, start_pos=(1, 0)):
                     for t in dedent_if_necessary(indent_start):
                         yield t

-            if (initial in numchars or                      # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
+            if not pseudomatch:                             # scan for tokens
+                match = whitespace.match(line, pos)
+                if pos == 0 and paren_level == 0:
+                    for t in dedent_if_necessary(match.end()):
+                        yield t
+                pos = match.end()
+                new_line = False
+                yield PythonToken(
+                    ERRORTOKEN, line[pos], (lnum, pos),
+                    additional_prefix + match.group(0)
+                )
+                additional_prefix = ''
+                pos += 1
+                continue
+
+            if (initial in numchars                         # ordinary number
+                    or (initial == '.' and token != '.' and token != '...')):
                 yield PythonToken(NUMBER, token, spos, prefix)
             elif pseudomatch.group(3) is not None:          # ordinary name
                 if token in always_break_tokens:
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index b73ce16..8377cd0 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -258,7 +258,7 @@ def test_token_types(code, types):


 def test_error_string():
-    t1, newline, endmarker = _get_token_list(' "\n')
+    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
     assert t1.type == ERRORTOKEN
     assert t1.prefix == ' '
     assert t1.string == '"'
@@ -339,6 +339,8 @@ def test_backslash():
 @pytest.mark.parametrize(
     ('code', 'types'), [
+        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
+
         # f-strings
         ('f"', [FSTRING_START]),
         ('f""', [FSTRING_START, FSTRING_END]),
         ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
@@ -394,7 +396,7 @@ def test_backslash():
         ]),
     ]
 )
-def test_fstring(code, types, version_ge_py36):
+def test_token_types(code, types, version_ge_py36):
     actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
     assert types + [ENDMARKER] == actual_types

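
Not part of the patch, just a rough way to check the behaviour the new test case pins down. This is a minimal sketch that assumes the internal helpers parso.python.tokenize.tokenize() and parso.utils.parse_version_string() are importable in this code base (the test suite's _get_token_list() helper appears to drive the same tokenizer); '3.6' is only an example version string.

    # Sketch, not part of the patch: feed the new test input through the
    # tokenizer and print the resulting token stream. Assumes the internal
    # tokenize() helper and parse_version_string() exist as in this code base.
    from parso.python.tokenize import tokenize
    from parso.utils import parse_version_string

    for tok in tokenize(' \x00a', parse_version_string('3.6')):
        print(tok.type, repr(tok.string))
    # Per the new test case, the expected token types are
    # INDENT, ERRORTOKEN, NAME, DEDENT and finally ENDMARKER.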