parso/test/test_tokenize.py

# -*- coding: utf-8 -*-    # This file contains Unicode characters.

import sys
from textwrap import dedent

import pytest

from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# Module-level aliases for the token types used by the tests below, so that
# expectations can be written as e.g. `NAME` instead of
# `PythonTokenTypes.NAME`.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    """
    Tokenize `string` and return every produced token as a list.

    `version` is passed through `parse_version_string`; leaving it at `None`
    is meant to select the currently running Python version.
    """
    tokens = tokenize.tokenize(string, parse_version_string(version))
    return list(tokens)


def test_end_pos_one_line():
    """Check the end position of a string that fits on a single line."""
    source = dedent('''
    def testit():
        a = "huhu"
    ''')
    funcdef = next(parse(source).iter_funcdefs())
    last_stmt = funcdef.get_suite().children[-1]
    string_leaf = last_stmt.children[0].get_rhs()
    assert string_leaf.end_pos == (3, 14)


def test_end_pos_multi_line():
    """Check the end position of a string that spans two lines."""
    source = dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    ''')
    funcdef = next(parse(source).iter_funcdefs())
    second_child = funcdef.get_suite().children[1]
    right_hand_side = second_child.children[0].get_rhs()
    # The right-hand side is `<string> + "h"`; its first child is the
    # multi-line string.
    multi_line_string = right_hand_side.children[0]
    assert multi_line_string.end_pos == (4, 11)


def test_simple_no_whitespace():
    """A one-line docstring without leading whitespace has an empty prefix."""
    code = '"""simple one line docstring"""'
    first_token = _get_token_list(code)[0]
    _, value, _, prefix = first_token
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    """Whitespace around a one-line docstring ends up in token prefixes."""
    code = '  """simple one line docstring""" \r\n'
    tokens = _get_token_list(code)
    assert tokens[0][0] == INDENT

    # The string itself carries the two leading spaces as its prefix.
    string_type, string_value, _, string_prefix = tokens[1]
    assert string_prefix == '  '
    assert string_value == '"""simple one line docstring"""'
    assert string_type == STRING

    # The space before `\r\n` becomes the prefix of the newline token.
    newline_type, _, _, newline_prefix = tokens[2]
    assert newline_prefix == ' '
    assert newline_type == NEWLINE


def test_function_whitespace():
    """Check the whitespace prefixes of tokens in a function definition."""
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    # Token value -> the prefix every token with that value must have.
    expected_prefixes = {
        'test_whitespace': ' ',
        '(': '',
        '*': '',
        '**': ' ',
        'print': '        ',
        'if': '    ',
    }
    for _, value, _, prefix in _get_token_list(fundef):
        if value in expected_prefixes:
            assert prefix == expected_prefixes[value]


def test_tokenize_multiline_I():
    # An unterminated multiline string followed by a newline: the end marker
    # is expected on the next line.
    code = '''""""\n'''
    expected = [
        PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
        PythonToken(ENDMARKER, '', (2, 0), ''),
    ]
    assert _get_token_list(code) == expected


def test_tokenize_multiline_II():
    # An unterminated multiline string without any newline: the end marker
    # is expected on the same line.
    code = '''""""'''
    expected = [
        PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
        PythonToken(ENDMARKER, '', (1, 4), ''),
    ]
    assert _get_token_list(code) == expected


def test_tokenize_multiline_III():
    # An unterminated multiline string followed by several newlines: the end
    # marker is expected on the line after the last newline.
    code = '''""""\n\n'''
    expected = [
        PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
        PythonToken(ENDMARKER, '', (3, 0), ''),
    ]
    assert _get_token_list(code) == expected


def test_identifier_contains_unicode():
    """Check the token type of a function name made of non-ASCII characters."""
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    # Index 1 is the token right after the `def` keyword, i.e. the name.
    name_token = _get_token_list(fundef)[1]
    if sys.version_info.major >= 3:
        expected_type = NAME
    else:
        # When running on Python 2 the identifier is expected to come out as
        # an error token. According to the original note the parser ignores
        # such tokens, which is considered acceptable.
        expected_type = ERRORTOKEN
    assert name_token[0] == expected_type


def test_quoted_strings():
    """`u`/`U` prefixed strings are parsed as a single string leaf."""
    literals = (
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    )

    for literal in literals:
        module = parse('''a = %s\n''' % literal)
        expr_stmt = module.children[0].children[0]
        # `a`, `=` and the string literal.
        assert len(expr_stmt.children) == 3
        string_leaf = expr_stmt.children[2]
        assert string_leaf.type == 'string'
        assert string_leaf.value == literal


def test_ur_literals():
    """
    Check which string prefixes are tokenized as literals.

    - `u""` is expected to be a string literal regardless of the Python
      version the tests run on.
    - `ur""` (in any casing) is only expected to be a literal on Python 2;
      otherwise the first token is expected to be a NAME.
    - `bR""` is always expected to be a literal, `Rb""` is only checked on
      Python 3.
    - f-string prefixes are expected to be literals from Python 3.6 on.
    """
    is_py3 = sys.version_info.major >= 3
    has_fstrings = sys.version_info >= (3, 6)

    def check(literal, is_literal=True):
        typ, result_literal, _, _ = _get_token_list(literal)[0]
        if not is_literal:
            # The prefix is tokenized on its own, as a name.
            assert typ == NAME
            return
        if typ == FSTRING_START:
            # An f-string start token is accepted without further checks.
            return
        assert typ == STRING
        assert result_literal == literal

    check('u""')
    check('ur""', is_literal=not is_py3)
    check('Ur""', is_literal=not is_py3)
    check('UR""', is_literal=not is_py3)
    check('bR""')
    # Starting with Python 3.3 this ordering is also possible.
    if is_py3:
        check('Rb""')

    # Format strings were introduced with Python 3.6.
    check('fr""', is_literal=has_fstrings)
    check('rF""', is_literal=has_fstrings)
    check('f""', is_literal=has_fstrings)
    check('F""', is_literal=has_fstrings)


def test_error_literal():
    """Unterminated string openers are tokenized as error tokens."""
    # A lone double quote followed by a newline.
    tokens = _get_token_list('"\n')
    error_token, newline, endmarker = tokens
    assert error_token.type == ERRORTOKEN
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    # An unterminated triple quote after an opening parenthesis.
    tokens = _get_token_list('( """')
    _bracket, error_token, endmarker = tokens
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
    """The last token always ends at the very end of the source code."""
    snippets = (
        '#c',
        '#c\n',
        'a\n',
        'a',
        r'a\\n',
        'a\\',
    )
    for code in snippets:
        last_token = _get_token_list(code)[-1]
        lines = split_lines(code)
        # Expected end: last line number and the length of that last line.
        assert last_token.end_pos == (len(lines), len(lines[-1]))


# Keyword arguments for `pytest.param` that mark a parametrized case as an
# expected failure when the tests themselves run on Python 2.
xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])


@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (
            '  foo\n bar \n baz',
            [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, NEWLINE, NAME, DEDENT],
        ),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Names, including non-ASCII characters
        ('1foo1', [NUMBER, NAME]),
        pytest.param(u'மெல்லினம்', [NAME], **xfail_py2),
        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),

        # Inconsistent indentation inside a class body
        (
            dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''),
            [
                NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT,
            ],
        ),

        # Closing brackets without a matching opening bracket
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        (
            'a\n b\n  )\n c',
            [
                NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                NEWLINE, DEDENT, NAME, DEDENT,
            ],
        ),
    ]
)
def test_token_types(code, types):
    """The tokenizer yields `types` for `code`, followed by an end marker."""
    expected_types = types + [ENDMARKER]
    tokens = _get_token_list(code)
    assert [token.type for token in tokens] == expected_types


def test_error_string():
    """An indented, unterminated string opener becomes an error token."""
    tokens = _get_token_list(' "\n')
    # Exactly five tokens are expected; the first and fourth are not
    # inspected any further.
    _first, error_token, newline, _fourth, endmarker = tokens
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''


def test_indent_error_recovery():
    """Check the token stream for an import following an unclosed paren."""
    code = dedent("""\
                        str(
        from x import a
        def
        """)
    expected_types = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a`, on the same line as the preceding `from x`
        NAME, NAME, NEWLINE,
        # The dedent happens because the import statement "breaks" out of
        # the parenthesis that was opened on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER,
    ]
    actual_types = [token.type for token in _get_token_list(code)]
    assert actual_types == expected_types


def test_error_token_after_dedent():
    """An error token right after a dedent follows the DEDENT token."""
    code = dedent("""\
        class C:
            pass
        $foo
        """)
    expected_types = [
        # `class C:` and its indented `pass` body
        NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
        # `$foo` plus the trailing newline
        ERRORTOKEN, NAME, NEWLINE, ENDMARKER,
    ]
    actual_types = [token.type for token in _get_token_list(code)]
    assert actual_types == expected_types


def test_brackets_no_indentation():
    """
    Regression test: the parentheses counter used to be able to drop below
    zero, which must not happen.
    """
    code = dedent("""\
        }
        {
          }
        """)
    expected_types = [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]
    actual_types = [token.type for token in _get_token_list(code)]
    assert actual_types == expected_types


def test_form_feed():
    """A form feed in front of an unterminated string is kept as prefix."""
    code = dedent('''\
        \f"""''')
    indent, error_token, dedent_, endmarker = _get_token_list(code)
    assert indent.type == INDENT
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert dedent_.type == DEDENT
    assert endmarker.prefix == ''


def test_carriage_return():
    """Check the token types for a backslash followed by a carriage return."""
    expected_types = [INDENT, OP, DEDENT, NAME, ENDMARKER]
    tokens = _get_token_list(' =\\\rclass')
    assert [token.type for token in tokens] == expected_types


def test_backslash():
    """A line continuation plus a comment ends up in the end marker prefix."""
    code = '\\\n# 1 \n'
    # The end marker is expected to be the only token.
    [endmarker] = _get_token_list(code)
    assert endmarker.prefix == code


@pytest.mark.parametrize(
    ('code', 'types'), [
        # f-strings
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        # an escaped quote does not terminate the f-string
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # format spec
        (r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
                                 FSTRING_STRING, OP, OP, NAME, OP, FSTRING_END]),

        # multiline f-string
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        ('f"""abc{\n123}def"""', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_string
        ('f"abc\\\ndef"', [
            FSTRING_START, FSTRING_STRING, FSTRING_END
        ]),
        ('f"\\\n{123}\\\n"', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_expr
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # a line continuation inside of a format spec
        ('f"{123:.2\\\nf}"', [
            FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
        ]),

        # a newline without a line continuation inside a single-line string is
        # wrong, and will generate an ERRORTOKEN
        ('f"abc\ndef"', [
            FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
        ]),

        # a more complex example
        (r'print(f"Some {x:.2f}a{y}")', [
            NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
            FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
        ]),
        # issue #86, a string-like in an f-string expression
        ('f"{ ""}"', [
            FSTRING_START, OP, FSTRING_END, STRING
        ]),
        ('f"{ f""}"', [
            FSTRING_START, OP, NAME, FSTRING_END, STRING
        ]),
    ]
)
def test_fstring_token_types(code, types, version_ge_py36):
    """
    The tokenizer yields `types` for the f-string `code`, followed by an end
    marker.

    `version_ge_py36` is a fixture defined outside of this file; its value is
    passed to `_get_token_list` as the version to tokenize for.
    """
    actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
    assert actual_types == types + [ENDMARKER]


@pytest.mark.parametrize(
    ('code', 'types'), [
        # issue #87: a `:=` at the outermost level of the expression (not
        # inside parentheses) is tokenized as the format spec marker `:`
        # followed by the format spec itself
        (
            'f"{x:=10}"',
            [FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END],
        ),
        # inside parentheses the tokens of an assignment expression appear
        (
            'f"{(x:=10)}"',
            [FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END],
        ),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    """Check how `:=` inside of an f-string expression is tokenized."""
    expected_types = types + [ENDMARKER]
    tokens = _get_token_list(code, version_ge_py38)
    assert expected_types == [token.type for token in tokens]


def test_fstring_end_error_pos(version_ge_py38):
    """Check token start positions for an f-string with an unclosed brace."""
    tokens = _get_token_list('f" { "', version_ge_py38)
    # Expected order: f-string start, f-string content, `{`, f-string end
    # and the end marker.
    expected_starts = [(1, 0), (1, 2), (1, 3), (1, 5), (1, 6)]
    assert [token.start_pos for token in tokens] == expected_starts