mirror of https://github.com/davidhalter/parso.git
This is a very intentional change. Previously, form feeds were handled very poorly and sometimes were not counted as indentation; counting them obviously makes sense. At the same time, indentation is very tricky to deal with (both for editors and for parso), and especially in the diff parser this led to a lot of very weird issues. The decision probably makes sense, since:

1. Almost nobody uses form feeds in the first place.
2. People that use form feeds, like Barry Warsaw, often put a newline after them (e.g. Python's email.__init__).
3. If you write an editor, you want to be able to identify a unicode character with a clear line/column. This would not be the case if form feeds were just ignored when counting.

Form feeds will still work in Jedi, will not cause parse errors, and in general you should be fine using them. It might just cause Jedi to count them as indentation **if** you use them like '\f foo()'. That is confusing for most editors anyway; it leads to a weird display, e.g. in VIM, even if it's perfectly valid Python code. Since parso is a code analysis parser and not the language's parser, I think it's fine to ignore this edge case.
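As a quick illustration (not part of the commit itself), here is a minimal sketch of how the behavior described above can be observed. It assumes the same tokenize.tokenize / parse_version_string API that the test file below uses; the exact token stream may differ between parso versions.

    from parso.utils import parse_version_string
    from parso.python import tokenize

    # Tokenize a snippet that starts with a form feed. As described above,
    # the form feed is kept in a token's prefix (and may be counted as
    # indentation) rather than being silently dropped, so every character
    # still maps to a clear line/column.
    for token in tokenize.tokenize('\f foo()', parse_version_string(None)):
        print(token.type, repr(token.string), token.start_pos, repr(token.prefix))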
443 lines | 14 KiB | Python
# -*- coding: utf-8    # This file contains Unicode characters.

import sys
from textwrap import dedent

import pytest

from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    # Load the current version.
    version_info = parse_version_string(version)
    return list(tokenize.tokenize(string, version_info))


def test_end_pos_one_line():
    parsed = parse(dedent('''
    def testit():
        a = "huhu"
    '''))
    simple_stmt = next(parsed.iter_funcdefs()).get_suite().children[-1]
    string = simple_stmt.children[0].get_rhs()
    assert string.end_pos == (3, 14)


def test_end_pos_multi_line():
    parsed = parse(dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    '''))
    expr_stmt = next(parsed.iter_funcdefs()).get_suite().children[1].children[0]
    string_leaf = expr_stmt.get_rhs().children[0]
    assert string_leaf.end_pos == (4, 11)


def test_simple_no_whitespace():
    # Test a simple one line string, no preceding whitespace
    simple_docstring = '"""simple one line docstring"""'
    token_list = _get_token_list(simple_docstring)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    # Test a simple one line string with preceding whitespace and newline
    simple_docstring = '  """simple one line docstring""" \r\n'
    token_list = _get_token_list(simple_docstring)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == '  '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE


def test_function_whitespace():
    # Test function definition whitespace identification
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    token_list = _get_token_list(fundef)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            assert prefix == '        '
        if value == 'if':
            assert prefix == '    '


def test_tokenize_multiline_I():
    # Make sure a multiline string with a trailing newline has the end
    # marker on the next line.
    fundef = '''""""\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (2, 0), '')]


def test_tokenize_multiline_II():
    # Make sure a multiline string without a newline has the end marker on
    # the same line.
    fundef = '''""""'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (1, 4), '')]


def test_tokenize_multiline_III():
    # Make sure a multiline string with newlines has the end marker on the
    # next line, even if there are several newlines.
    fundef = '''""""\n\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (3, 0), '')]


def test_identifier_contains_unicode():
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    token_list = _get_token_list(fundef)
    unicode_token = token_list[1]
    if sys.version_info.major >= 3:
        assert unicode_token[0] == NAME
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, that's ok.
        assert unicode_token[0] == ERRORTOKEN


def test_quoted_strings():
    string_tokens = [
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    ]

    for s in string_tokens:
        module = parse('''a = %s\n''' % s)
        simple_stmt = module.children[0]
        expr_stmt = simple_stmt.children[0]
        assert len(expr_stmt.children) == 3
        string_tok = expr_stmt.children[2]
        assert string_tok.type == 'string'
        assert string_tok.value == s


def test_ur_literals():
    """
    Decided to parse `u''` literals regardless of Python version. This
    probably makes sense:

    - Python 3+ doesn't support them, but parsing them anyway doesn't hurt.
      While this is incorrect, it's just incorrect for one "old" and in the
      future not very important version.
    - All the other Python versions work very well with it.
    """
    def check(literal, is_literal=True):
        token_list = _get_token_list(literal)
        typ, result_literal, _, _ = token_list[0]
        if is_literal:
            if typ != FSTRING_START:
                assert typ == STRING
                assert result_literal == literal
        else:
            assert typ == NAME

    check('u""')
    check('ur""', is_literal=not sys.version_info.major >= 3)
    check('Ur""', is_literal=not sys.version_info.major >= 3)
    check('UR""', is_literal=not sys.version_info.major >= 3)
    check('bR""')
    # Starting with Python 3.3 this ordering is also possible.
    if sys.version_info.major >= 3:
        check('Rb""')

    # Starting with Python 3.6 format strings were introduced.
    check('fr""', is_literal=sys.version_info >= (3, 6))
    check('rF""', is_literal=sys.version_info >= (3, 6))
    check('f""', is_literal=sys.version_info >= (3, 6))
    check('F""', is_literal=sys.version_info >= (3, 6))


def test_error_literal():
    error_token, newline, endmarker = _get_token_list('"\n')
    assert error_token.type == ERRORTOKEN
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
    def check(code):
        tokens = _get_token_list(code)
        lines = split_lines(code)
        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))

    check('#c')
    check('#c\n')
    check('a\n')
    check('a')
    check(r'a\\n')
    check('a\\')


xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])


@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Name stuff
        ('1foo1', [NUMBER, NAME]),
        pytest.param(
            u'மெல்லினம்', [NAME],
            **xfail_py2),
        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
        (dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''), [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                   ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                   NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT]),
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('a\n b\n  )\n c', [NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                            NEWLINE, DEDENT, NAME, DEDENT]),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]


def test_error_string():
    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
    assert t1.type == ERRORTOKEN
    assert t1.prefix == ' '
    assert t1.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''


def test_indent_error_recovery():
    code = dedent("""\
            str(
        from x import a
        def
        """)
    lst = _get_token_list(code)
    expected = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a` on the same line as the previous `from x`
        NAME, NAME, NEWLINE,
        # A dedent happens, because there's an import now and the import
        # statement "breaks" out of the opening paren on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER]
    assert [t.type for t in lst] == expected


def test_error_token_after_dedent():
    code = dedent("""\
        class C:
            pass
        $foo
        """)
    lst = _get_token_list(code)
    expected = [
        NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
        # $foo\n
        ERRORTOKEN, NAME, NEWLINE, ENDMARKER
    ]
    assert [t.type for t in lst] == expected


def test_brackets_no_indentation():
    """
    There used to be an issue that the parentheses counting would go below
    zero. This should not happen.
    """
    code = dedent("""\
        }
        {
        }
        """)
    lst = _get_token_list(code)
    assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]


def test_form_feed():
    indent, error_token, dedent_, endmarker = _get_token_list(dedent('''\
        \f"""'''))
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert endmarker.prefix == ''
    assert indent.type == INDENT
    assert dedent_.type == DEDENT


def test_carriage_return():
    lst = _get_token_list(' =\\\rclass')
    assert [t.type for t in lst] == [INDENT, OP, DEDENT, NAME, ENDMARKER]


def test_backslash():
    code = '\\\n# 1 \n'
    endmarker, = _get_token_list(code)
    assert endmarker.prefix == code


@pytest.mark.parametrize(
    ('code', 'types'), [
        # f-strings
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # format spec
        (r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
                                 FSTRING_STRING, OP, OP, NAME, OP, FSTRING_END]),

        # multiline f-string
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        ('f"""abc{\n123}def"""', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_string
        ('f"abc\\\ndef"', [
            FSTRING_START, FSTRING_STRING, FSTRING_END
        ]),
        ('f"\\\n{123}\\\n"', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_expr
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # a line continuation inside of a format spec
        ('f"{123:.2\\\nf}"', [
            FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
        ]),

        # a newline without a line continuation inside a single-line string
        # is wrong, and will generate an ERRORTOKEN
        ('f"abc\ndef"', [
            FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
        ]),

        # a more complex example
        (r'print(f"Some {x:.2f}a{y}")', [
            NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
            FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
        ]),
        # issue #86, a string-like in an f-string expression
        ('f"{ ""}"', [
            FSTRING_START, OP, FSTRING_END, STRING
        ]),
        ('f"{ f""}"', [
            FSTRING_START, OP, NAME, FSTRING_END, STRING
        ]),
    ]
)
def test_fstring_token_types(code, types, version_ge_py36):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
    assert types + [ENDMARKER] == actual_types


@pytest.mark.parametrize(
    ('code', 'types'), [
        # issue #87, a `:=` outside the parentheses should be tokenized
        # as a format spec marker and as part of the format spec
        ('f"{x:=10}"', [
            FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END
        ]),
        ('f"{(x:=10)}"', [
            FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END
        ]),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py38)]
    assert types + [ENDMARKER] == actual_types


def test_fstring_end_error_pos(version_ge_py38):
    f_start, f_string, bracket, f_end, endmarker = \
        _get_token_list('f" { "', version_ge_py38)
    assert f_start.start_pos == (1, 0)
    assert f_string.start_pos == (1, 2)
    assert bracket.start_pos == (1, 3)
    assert f_end.start_pos == (1, 5)
    assert endmarker.start_pos == (1, 6)