# parso/test/test_tokenize.py
# -*- coding: utf-8 -*-    # This file contains Unicode characters.
from textwrap import dedent

import pytest

from parso._compatibility import py_version
from parso.utils import split_lines, parse_version_string
from parso.python.token import (
    NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER,
    ERROR_DEDENT, FSTRING_START)
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


def _get_token_list(string):
    # Load the current version.
    version_info = parse_version_string()
    return list(tokenize.tokenize(string, version_info))
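

# A minimal sketch of the token shape the helper above yields; the expected
# values here are assumptions inferred from the assertions in this file, not
# part of the original suite. Each PythonToken is a namedtuple of
# (type, string, start_pos, prefix), where start_pos is a (1-indexed line,
# 0-indexed column) pair and prefix holds the whitespace that precedes the
# token.
def test_token_namedtuple_sketch():
    typ, string, start_pos, prefix = _get_token_list('x')[0]
    assert typ == NAME
    assert string == 'x'
    assert start_pos == (1, 0)
    assert prefix == ''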


def test_end_pos_one_line():
    parsed = parse(dedent('''
    def testit():
        a = "huhu"
    '''))
    simple_stmt = next(parsed.iter_funcdefs()).get_suite().children[-1]
    string = simple_stmt.children[0].get_rhs()
    # end_pos is (line, column) with 1-indexed lines and 0-indexed columns.
    assert string.end_pos == (3, 14)


def test_end_pos_multi_line():
    parsed = parse(dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    '''))
    expr_stmt = next(parsed.iter_funcdefs()).get_suite().children[1].children[0]
    string_leaf = expr_stmt.get_rhs().children[0]
    assert string_leaf.end_pos == (4, 11)


def test_simple_no_whitespace():
    # Test a simple one line string, no preceding whitespace.
    simple_docstring = '"""simple one line docstring"""'
    token_list = _get_token_list(simple_docstring)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    # Test a simple one line string with preceding whitespace and newline.
    simple_docstring = ' """simple one line docstring""" \r\n'
    token_list = _get_token_list(simple_docstring)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == ' '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE


def test_function_whitespace():
    # Test function definition whitespace identification.
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    token_list = _get_token_list(fundef)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            # The prefix carries the eight spaces of indentation.
            assert prefix == '        '
        if value == 'if':
            # The prefix carries the four spaces of indentation.
            assert prefix == '    '


def test_tokenize_multiline_I():
    # Make sure multiline strings containing newlines have the end marker on
    # the next line.
    fundef = '''""""\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (2, 0), '')]


def test_tokenize_multiline_II():
    # Make sure multiline strings without newlines have the end marker on the
    # same line.
    fundef = '''""""'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (1, 4), '')]


def test_tokenize_multiline_III():
    # Make sure multiline strings containing newlines have the end marker on
    # the next line, even with several newlines.
    fundef = '''""""\n\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (3, 0), '')]


def test_identifier_contains_unicode():
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    token_list = _get_token_list(fundef)
    unicode_token = token_list[1]
    if py_version >= 30:
        assert unicode_token[0] == NAME
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, which is ok.
        assert unicode_token[0] == tokenize.ERRORTOKEN


def test_quoted_strings():
    string_tokens = [
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    ]
    for s in string_tokens:
        module = parse('''a = %s\n''' % s)
        simple_stmt = module.children[0]
        expr_stmt = simple_stmt.children[0]
        assert len(expr_stmt.children) == 3
        string_tok = expr_stmt.children[2]
        assert string_tok.type == 'string'
        assert string_tok.value == s


def test_ur_literals():
    """
    Decided to parse `u''` literals regardless of Python version. This
    probably makes sense:

    - Python 3 (before 3.3) doesn't support them, but accepting them anyway
      doesn't hurt. While this is incorrect, it's only incorrect for "old"
      versions that won't be very important in the future.
    - All the other Python versions work well with them.
    """
    def check(literal, is_literal=True):
        token_list = _get_token_list(literal)
        typ, result_literal, _, _ = token_list[0]
        if is_literal:
            if typ != FSTRING_START:
                assert typ == STRING
                assert result_literal == literal
        else:
            assert typ == NAME

    check('u""')
    check('ur""', is_literal=not py_version >= 30)
    check('Ur""', is_literal=not py_version >= 30)
    check('UR""', is_literal=not py_version >= 30)
    check('bR""')
    # Starting with Python 3.3 this ordering is also possible.
    if py_version >= 33:
        check('Rb""')
    # Starting with Python 3.6 format strings were introduced.
    check('fr""', is_literal=py_version >= 36)
    check('rF""', is_literal=py_version >= 36)
    check('f""', is_literal=py_version >= 36)
    check('F""', is_literal=py_version >= 36)


def test_error_literal():
    error_token, endmarker = _get_token_list('"\n')
    assert error_token.type == tokenize.ERRORTOKEN
    assert error_token.string == '"\n'
    assert endmarker.type == tokenize.ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == tokenize.ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == tokenize.ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
    def check(code):
        tokens = _get_token_list(code)
        lines = split_lines(code)
        # The endmarker sits at the very end of the code: on the last line,
        # right after its last character.
        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))

    check('#c')
    check('#c\n')
    check('a\n')
    check('a')
    check(r'a\\n')
    check('a\\')
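

# For reference, a sketch of the split_lines behavior the check above relies
# on (the exact return values are assumptions about parso.utils.split_lines):
# code ending in a newline yields a trailing empty string, so the endmarker
# of 'a\n' lands at (2, 0).
def test_split_lines_sketch():
    assert split_lines('a\n') == ['a', '']
    assert split_lines('a') == ['a']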


@pytest.mark.parametrize(
    ('code', 'types'), [
        (' foo', [INDENT, NAME, DEDENT]),
        # Dedenting to a level that was never opened yields ERROR_DEDENT.
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),
    ]
)
def test_indentation(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]


def test_error_string():
    t1, endmarker = _get_token_list(' "\n')
    assert t1.type == ERRORTOKEN
    assert t1.prefix == ' '
    assert t1.string == '"\n'
    assert endmarker.string == ''