parso/parso/python/normalizer.py

# -*- coding: utf-8 -*-
import codecs
import warnings
import re
from contextlib import contextmanager

from parso.normalizer import Normalizer, NormalizerConfig, Issue

_BLOCK_STMTS = ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt')
_STAR_EXPR_PARENTS = ('testlist_star_expr', 'testlist_comp', 'exprlist')
# This is the maximal block size given by python.
_MAX_BLOCK_SIZE = 20
_MAX_INDENT_COUNT = 100
ALLOWED_FUTURES = (
    'all_feature_names', 'nested_scopes', 'generators', 'division',
    'absolute_import', 'with_statement', 'print_function', 'unicode_literals',
    # TODO make this one optional in lower python versions.
    'generator_stop'
)


def _is_bytes_literal(string):
    return 'b' in string.string_prefix.lower()


def _iter_stmts(scope):
    """
    Iterates over all statements and splits up  simple_stmt.
    """
    for child in scope.children:
        if child.type == 'simple_stmt':
            for child2 in child.children:
                if child2.type == 'newline' or child2 == ';':
                    continue
                yield child2
        else:
            yield child


def _get_comprehension_type(atom):
    first, second = atom.children[:2]
    if second.type == 'testlist_comp' and second.children[1].type == 'comp_for':
        if first == '[':
            return 'list comprehension'
        else:
            return 'generator expression'
    elif second.type == 'dictorsetmaker' and second.children[-1].type == 'comp_for':
        if second.children[1] == ':':
            return 'dict comprehension'
        else:
            return 'set comprehension'
    return None


def _is_future_import(import_from):
    # It looks like a __future__ import that is relative is still a future
    # import. That feels kind of odd, but whatever.
    # if import_from.level != 0:
        # return False
    from_names = import_from.get_from_names()
    return [n.value for n in from_names] == ['__future__']


def _remove_parens(atom):
    """
    Returns the inner part of an expression like `(foo)`. Also removes nested
    parens.
    """
    try:
        children = atom.children
    except AttributeError:
        pass
    else:
        if len(children) == 3 and children[0] == '(':
            return _remove_parens(atom.children[1])
    return atom


def _iter_params(parent_node):
    return (n for n in parent_node.children if n.type == 'param')


def _is_future_import_first(import_from):
    """
    Checks if the import is the first statement of a file.
    """
    found_docstring = False
    for stmt in _iter_stmts(import_from.get_root_node()):
        if stmt.type == 'string' and not found_docstring:
            continue
        found_docstring = True

        if stmt == import_from:
            return True
        if stmt.type == 'import_from' and _is_future_import(stmt):
            continue
        return False


class Context(object):
    def __init__(self, node, add_syntax_error, parent_context=None):
        self.node = node
        self.blocks = []
        self.parent_context = parent_context
        self._used_name_dict = {}
        self._global_names = []
        self._nonlocal_names = []
        self._nonlocal_names_in_subscopes = []
        self._add_syntax_error = add_syntax_error

    def is_async_funcdef(self):
        # Stupidly enough async funcdefs can have two different forms,
        # depending if a decorator is used or not.
        return self.is_function() \
            and self.node.parent.type in ('async_funcdef', 'async_stmt')

    def is_function(self):
        return self.node.type == 'funcdef'

    def add_name(self, name):
        parent_type = name.parent.type
        if parent_type == 'trailer':
            # We are only interested in first level names.
            return

        if parent_type == 'global_stmt':
            self._global_names.append(name)
        elif parent_type == 'nonlocal_stmt':
            self._nonlocal_names.append(name)
        else:
            self._used_name_dict.setdefault(name.value, []).append(name)

    def finalize(self):
        """
        Returns a list of nonlocal names that need to be part of that scope.
        """
        self._analyze_names(self._global_names, 'global')
        self._analyze_names(self._nonlocal_names, 'nonlocal')

        # Python2.6 doesn't have dict comprehensions.
        global_name_strs = dict((n.value, n) for n in self._global_names)
        for nonlocal_name in self._nonlocal_names:
            try:
                global_name = global_name_strs[nonlocal_name.value]
            except KeyError:
                continue

            message = "name '%s' is nonlocal and global" % global_name.value
            if global_name.start_pos < nonlocal_name.start_pos:
                error_name = global_name
            else:
                error_name = nonlocal_name
            self._add_syntax_error(message, error_name)

        nonlocals_not_handled = []
        for nonlocal_name in self._nonlocal_names_in_subscopes:
            search = nonlocal_name.value
            if search in global_name_strs or self.parent_context is None:
                message = "no binding for nonlocal '%s' found" % nonlocal_name.value
                self._add_syntax_error(message, nonlocal_name)
            elif not self.is_function() or \
                    nonlocal_name.value not in self._used_name_dict:
                nonlocals_not_handled.append(nonlocal_name)
        return self._nonlocal_names + nonlocals_not_handled

    def _analyze_names(self, globals_or_nonlocals, type_):
        def raise_(message):
            self._add_syntax_error(message % (base_name.value, type_), base_name)

        params = []
        if self.node.type == 'funcdef':
            params = self.node.params

        for base_name in globals_or_nonlocals:
            found_global_or_nonlocal = False
            # Somehow Python does it the reversed way.
            for name in reversed(self._used_name_dict.get(base_name.value, [])):
                if name.start_pos > base_name.start_pos:
                    # All following names don't have to be checked.
                    found_global_or_nonlocal = True

                parent = name.parent
                if parent.type == 'param' and parent.name == name:
                    # Skip those here, these definitions belong to the next
                    # scope.
                    continue

                if name.is_definition():
                    if parent.type == 'expr_stmt' \
                            and parent.children[1].type == 'annassign':
                        if found_global_or_nonlocal:
                            # If it's after the global the error seems to be
                            # placed there.
                            base_name = name
                        raise_("annotated name '%s' can't be %s")
                        break
                    else:
                        message = "name '%s' is assigned to before %s declaration"
                else:
                    message = "name '%s' is used prior to %s declaration"

                if not found_global_or_nonlocal:
                    raise_(message)
                    # Only add an error for the first occurence.
                    break

            for param in params:
                if param.name.value == base_name.value:
                    raise_("name '%s' is parameter and %s"),

    @contextmanager
    def add_block(self, node):
        self.blocks.append(node)
        yield
        self.blocks.pop()

    @contextmanager
    def add_context(self, node):
        new_context = Context(node, self._add_syntax_error, parent_context=self)
        yield new_context
        self._nonlocal_names_in_subscopes += new_context.finalize()


class ErrorFinder(Normalizer):
    """
    Searches for errors in the syntax tree.
    """
    def __init__(self, *args, **kwargs):
        super(ErrorFinder, self).__init__(*args, **kwargs)
        self._error_dict = {}
        self._version = self._grammar._version_int

    def initialize(self, node):
        from parso.python.tree import search_ancestor
        allowed = 'classdef', 'funcdef', 'file_input'
        if node.type in allowed:
            parent_scope = node
        else:
            parent_scope = search_ancestor(node, allowed)
        self._context = Context(parent_scope, self._add_syntax_error)
        self._indentation_count = 0

    @contextmanager
    def visit_node(self, node):
        if node.type == 'error_node':
            leaf = node.get_next_leaf()
            if node.children[-1].type == 'newline':
                # This is the beginning of a suite that is not indented.
                spacing = list(leaf._split_prefix())[-1]
                self._add_indentation_error('expected an indented block', spacing)
            else:
                if leaf.type != 'error_leaf':
                    # Error leafs will be added later as an error.
                    self._add_syntax_error("invalid syntax", leaf)
        elif node.type in _BLOCK_STMTS:
            if node.type == 'try_stmt':
                default_except = None
                for except_clause in node.children[3::3]:
                    if except_clause in ('else', 'finally'):
                        break
                    if except_clause == 'except':
                        default_except = except_clause
                    elif default_except is not None:
                        self._add_syntax_error("default 'except:' must be last", default_except)

            if node.type == 'for_stmt':
                # Some of the nodes here are already used, so no else if
                expr_list = node.children[1]
                if expr_list.type != 'expr_list':  # Already handled.
                    self._check_assignment(expr_list)

            with self._context.add_block(node):
                if len(self._context.blocks) == _MAX_BLOCK_SIZE:
                    self._add_syntax_error("too many statically nested blocks", node)
                yield
            return
        elif node.type in ('classdef', 'funcdef'):
            context = self._context
            with self._context.add_context(node) as new_context:
                self._context = new_context
                yield
                self._context = context
            return
        elif node.type == 'import_from' and _is_future_import(node):
            if not _is_future_import_first(node):
                message = "from __future__ imports must occur at the beginning of the file"
                self._add_syntax_error(message, node)

            for from_name, future_name in node.get_paths():
                name = future_name.value
                if name== 'braces':
                    message = "not a chance"
                    self._add_syntax_error(message, node)
                elif name == 'barry_as_FLUFL':
                    message = "Seriously I'm not implementing this :) ~ Dave"
                    self._add_syntax_error(message, node)
                elif name not in ALLOWED_FUTURES:
                    message = "future feature %s is not defined" % name
                    self._add_syntax_error(message, node)
        elif node.type == 'import_from':
            if node.is_star_import() and self._context.parent_context is not None:
                message = "import * only allowed at module level"
                self._add_syntax_error(message, node)
        elif node.type == 'import_as_names':
            if node.children[-1] == ',':
                # from foo import a,
                message = "trailing comma not allowed without surrounding parentheses"
                self._add_syntax_error(message, node)
        elif node.type in _STAR_EXPR_PARENTS:
            if node.parent.type == 'del_stmt':
                self._add_syntax_error("can't use starred expression here", node.parent)
            else:
                def is_definition(node, ancestor):
                    if ancestor is None:
                        return False

                    type_ = ancestor.type
                    if type_ == 'trailer':
                        return False

                    if type_ == 'expr_stmt':
                        return node.start_pos < ancestor.children[-1].start_pos

                    return is_definition(node, ancestor.parent)

                if is_definition(node, node.parent):
                    args = [c for c in node.children if c != ',']
                    starred = [c for c in args if c.type == 'star_expr']
                    if len(starred) > 1:
                        message = "two starred expressions in assignment"
                        self._add_syntax_error(message, starred[1])
                    elif starred:
                        count = args.index(starred[0])
                        if count >= 256:
                            message = "too many expressions in star-unpacking assignment"
                            self._add_syntax_error(message, starred[0])
        elif node.type == 'star_expr':
            if node.parent.type not in _STAR_EXPR_PARENTS:
                message = "starred assignment target must be in a list or tuple"
                self._add_syntax_error(message, node)
            if node.parent.type == 'testlist_comp':
                # [*[] for a in [1]]
                if node.parent.children[1].type == 'comp_for':
                    message = "iterable unpacking cannot be used in comprehension"
                    self._add_syntax_error(message, node)
        elif node.type == 'comp_for':
            # Some of the nodes here are already used, so no else if
            expr_list = node.children[1 + int(node.children[0] == 'async')]
            if expr_list.type != 'expr_list':  # Already handled.
                self._check_assignment(expr_list)

            if node.children[0] == 'async' \
                    and not self._context.is_async_funcdef():
                message = "asynchronous comprehension outside of an asynchronous function"
                self._add_syntax_error(message, node)
        elif node.type == 'arglist':
            first_arg = node.children[0]
            if first_arg.type == 'argument' \
                    and first_arg.children[1].type == 'comp_for':
                if len(node.children) >= 2:
                    # foo(x for x in [], b)
                    message = "Generator expression must be parenthesized if not sole argument"
                    self._add_syntax_error(message, node)
            else:
                arg_set = set()
                kw_only = False
                kw_unpacking_only = False
                is_old_starred = False
                # In python 3 this would be a bit easier (stars are part of
                # argument), but we have to understand both.
                for argument in node.children:
                    if argument == ',':
                        continue

                    if argument in ('*', '**'):
                        # Python < 3.5 has the order engraved in the grammar
                        # file.  No need to do anything here.
                        is_old_starred = True
                        continue
                    if is_old_starred:
                        is_old_starred = False
                        continue

                    if argument.type == 'argument':
                        first = argument.children[0]
                        if first in ('*', '**'):
                            if first == '*':
                                if kw_unpacking_only:
                                    # foo(**kwargs, *args)
                                    message = "iterable argument unpacking follows keyword argument unpacking"
                                    self._add_syntax_error(message, argument)
                            else:
                                kw_unpacking_only = True
                        else:  # Is a keyword argument.
                            kw_only = True
                            if first.type == 'name':
                                if first.value in arg_set:
                                    # f(x=1, x=2)
                                    message = "keyword argument repeated"
                                    self._add_syntax_error(message, first)
                                else:
                                    arg_set.add(first.value)
                    else:
                        if kw_unpacking_only:
                            # f(**x, y)
                            message = "positional argument follows keyword argument unpacking"
                            self._add_syntax_error(message, argument)
                        elif kw_only:
                            # f(x=2, y)
                            message = "positional argument follows keyword argument"
                            self._add_syntax_error(message, argument)
        elif node.type == 'atom':
            first = node.children[0]
            # e.g. 's' b''
            message = "cannot mix bytes and nonbytes literals"
            # TODO this check is only relevant for Python 3+
            if first.type == 'string':
                first_is_bytes = _is_bytes_literal(first)
                for string in node.children[1:]:
                    if first_is_bytes != _is_bytes_literal(string):
                        self._add_syntax_error(message, node)
                        break
        elif node.type in ('parameters', 'lambdef'):
            param_names = set()
            default_only = False
            for p in _iter_params(node):
                if p.name.value in param_names:
                    message = "duplicate argument '%s' in function definition"
                    self._add_syntax_error(message % p.name.value, p.name)
                param_names.add(p.name.value)

                if p.default is None:
                    if default_only:
                        # def f(x=3, y): pass
                        message = "non-default argument follows default argument"
                        self._add_syntax_error(message, node)
                else:
                    default_only = True
        elif node.type == 'annassign':
            # x, y: str
            type_ = None
            message = "only single target (not %s) can be annotated"
            lhs = node.parent.children[0]
            lhs = _remove_parens(lhs)
            try:
                children = lhs.children
            except AttributeError:
                pass
            else:
                if ',' in children or lhs.type == 'atom' and children[0] == '(':
                    type_ = 'tuple'
                elif lhs.type == 'atom' and children[0] == '[':
                    type_ = 'list'
                trailer = children[-1]

            if type_ is None:
                if not (lhs.type == 'name'
                        # subscript/attributes are allowed
                        or lhs.type in ('atom_expr', 'power')
                            and trailer.type == 'trailer'
                            and trailer.children[0] != '('):
                    # True: int
                    # {}: float
                    message = "illegal target for annotation"
                    self._add_syntax_error(message, lhs.parent)
            else:
                self._add_syntax_error(message % type_, lhs.parent)
        elif node.type == 'argument':
            first = node.children[0]
            if node.children[1] == '=' and first.type != 'name':
                if first.type == 'lambdef':
                    # f(lambda: 1=1)
                    message = "lambda cannot contain assignment"
                else:
                    # f(+x=1)
                    message = "keyword can't be an expression"
                self._add_syntax_error(message, first)
        elif node.type == 'nonlocal_stmt':
            if self._context.parent_context is None:
                message = "nonlocal declaration not allowed at module level"
                self._add_syntax_error(message, node)
            elif self._context.is_function():
                for nonlocal_name in node.children[1::2]:
                    param_names = [p.name.value for p in self._context.node.params]
                    if nonlocal_name.value == node:
                        pass
        elif node.type == 'expr_stmt':
            for before_equal in node.children[:-2:2]:
                self._check_assignment(before_equal)

            augassign = node.children[1]
            if augassign != '=' and augassign.type != 'annassign':  # Is augassign.
                if node.children[0].type in ('testlist_star_expr', 'atom', 'testlist'):
                    message = "illegal expression for augmented assignment"
                    self._add_syntax_error(message, node)
        elif node.type == 'with_item':
            self._check_assignment(node.children[2])
        elif node.type == 'del_stmt':
            child = node.children[1]
            if child.type != 'expr_list':  # Already handled.
                self._check_assignment(child, is_deletion=True)
        elif node.type == 'expr_list':
            for expr in node.children[::2]:
                self._check_assignment(expr)
        elif node.type == 'suite':
            self._indentation_count += 1
            if self._indentation_count == _MAX_INDENT_COUNT:
                self._add_indentation_error("too many levels of indentation", node.children[1])

        yield

    def visit_leaf(self, leaf):
        if leaf.type == 'error_leaf':
            if leaf.original_type in ('indent', 'error_dedent'):
                # Indents/Dedents itself never have a prefix. They are just
                # "pseudo" tokens that get removed by the syntax tree later.
                # Therefore in case of an error we also have to check for this.
                spacing = list(leaf.get_next_leaf()._split_prefix())[-1]
                if leaf.original_type == 'indent':
                    message = 'unexpected indent'
                else:
                    message = 'unindent does not match any outer indentation level'
                self._add_indentation_error(message, spacing)
            else:
                if leaf.value.startswith('\\'):
                    message = 'unexpected character after line continuation character'
                else:
                    match = re.match('\\w{,2}("{1,3}|\'{1,3})', leaf.value)
                    if match is None:
                        message = 'invalid syntax'
                    else:
                        if len(match.group(1)) == 1:
                            message = 'EOL while scanning string literal'
                        else:
                            message = 'EOF while scanning triple-quoted string literal'
                self._add_syntax_error(message, leaf)
        elif leaf.type == 'name':
            if leaf.value == '__debug__' and leaf.is_definition():
                if self._version < (3, 0):
                    message = 'cannot assign to __debug__'
                else:
                    message = 'assignment to keyword'
                self._add_syntax_error(message, leaf)
            if leaf.value == 'None' and self._version < (3, 0) and leaf.is_definition():
                self._add_syntax_error('cannot assign to None', leaf)

            self._context.add_name(leaf)
        elif leaf.type == 'string':
            string_prefix = leaf.string_prefix.lower()
            if 'b' in string_prefix \
                    and any(c for c in leaf.value if ord(c) > 127):
                # TODO add check for python 3
                # b'ä'
                message = "bytes can only contain ASCII literal characters."
                self._add_syntax_error(message, leaf)

            if 'r' not in string_prefix:
                # Raw strings don't need to be checked if they have proper
                # escaping.
                is_bytes = self._version < (3, 0)
                if 'b' in string_prefix:
                    is_bytes = True
                if 'u' in string_prefix:
                    is_bytes = False

                payload = leaf._get_payload()
                if is_bytes:
                    payload = payload.encode('utf-8')
                    func = codecs.escape_decode
                else:
                    func = codecs.unicode_escape_decode

                try:
                    with warnings.catch_warnings():
                        # The warnings from parsing strings are not relevant.
                        warnings.filterwarnings('ignore')
                        func(payload)
                except UnicodeDecodeError as e:
                    self._add_syntax_error('(unicode error) ' + str(e), leaf)
                except ValueError as e:
                    self._add_syntax_error('(value error) ' + str(e), leaf)

        elif leaf.value == 'continue':
            in_loop = False
            for block in self._context.blocks:
                if block.type == 'for_stmt':
                    in_loop = True
                if block.type == 'try_stmt':
                    last_block = block.children[-3]
                    if last_block == 'finally' and leaf.start_pos > last_block.start_pos:
                        message = "'continue' not supported inside 'finally' clause"
                        self._add_syntax_error(message, leaf)
            if not in_loop:
                message = "'continue' not properly in loop"
                self._add_syntax_error(message, leaf)
        elif leaf.value == 'break':
            in_loop = False
            for block in self._context.blocks:
                if block.type == 'for_stmt':
                    in_loop = True
            if not in_loop:
                self._add_syntax_error("'break' outside loop", leaf)
        elif leaf.value in ('yield', 'return'):
            if self._context.node.type != 'funcdef':
                self._add_syntax_error("'%s' outside function" % leaf.value, leaf.parent)
            elif self._context.is_async_funcdef() \
                    and any(self._context.node.iter_yield_exprs()):
                if leaf.value == 'return' and leaf.parent.type == 'return_stmt':
                    self._add_syntax_error("'return' with value in async generator", leaf.parent)
                elif leaf.value == 'yield' \
                        and leaf.get_next_leaf() != 'from' \
                        and self._version == (3, 5):
                    self._add_syntax_error("'yield' inside async function", leaf.parent)
        elif leaf.value == 'await':
            if not self._context.is_async_funcdef():
                self._add_syntax_error("'await' outside async function", leaf.parent)
        elif leaf.value == 'from' and leaf.parent.type == 'yield_arg' \
                and self._context.is_async_funcdef():
            yield_ = leaf.parent.parent
            self._add_syntax_error("'yield from' inside async function", yield_)
        elif leaf.value == '*':
            params = leaf.parent
            if params.type == 'parameters' and params:
                after = params.children[params.children.index(leaf) + 1:]
                after = [child for child in after
                         if child not in (',', ')') and not child.star_count]
                if len(after) == 0:
                    self._add_syntax_error("named arguments must follow bare *", leaf)
        elif leaf.value == '**':
            if leaf.parent.type == 'dictorsetmaker':
                comp_for = leaf.get_next_sibling().get_next_sibling()
                if comp_for is not None and comp_for.type == 'comp_for':
                    # {**{} for a in [1]}
                    message = "dict unpacking cannot be used in dict comprehension"
                    # TODO probably this should get a better end_pos including
                    # the next sibling of leaf.
                    self._add_syntax_error(message, leaf)

        return ''

    def _check_assignment(self, node, is_deletion=False):
        error = None
        type_ = node.type
        if type_ == 'lambdef':
            error = 'lambda'
        elif type_ == 'atom':
            first, second = node.children[:2]
            error = _get_comprehension_type(node)
            if error is None:
                if second.type in ('dictorsetmaker', 'string'):
                    error = 'literal'
                elif first in ('(', '['):
                    if second.type == 'yield_expr':
                        error = 'yield expression'
                    elif second.type == 'testlist_comp':
                        # This is not a comprehension, they were handled
                        # further above.
                        for child in second.children[::2]:
                            self._check_assignment(child, is_deletion)
                    else:  # Everything handled, must be useless brackets.
                        self._check_assignment(second, is_deletion)
        elif type_ == 'keyword':
            error = 'keyword'
        elif type_ == 'operator':
            if node.value == '...':
                error = 'Ellipsis'
        elif type_ == 'comparison':
            error = 'comparison'
        elif type_ in ('string', 'number'):
            error = 'literal'
        elif type_ == 'yield_expr':
            # This one seems to be a slightly different warning in Python.
            message = 'assignment to yield expression not possible'
            self._add_syntax_error(message, node)
        elif type_ == 'test':
            error = 'conditional expression'
        elif type_ in ('atom_expr', 'power'):
            if node.children[0] == 'await':
                error = 'await expression'
            elif node.children[-2] == '**':
                error = 'operator'
            else:
                # Has a trailer
                trailer = node.children[-1]
                assert trailer.type == 'trailer'
                if trailer.children[0] == '(':
                    error = 'function call'
        elif type_ in ('testlist_star_expr', 'exprlist', 'testlist'):
            for child in node.children[::2]:
                self._check_assignment(child, is_deletion)
        elif ('expr' in type_ and type_ != 'star_expr' # is a substring
              or '_test' in type_
              or type_ in ('term', 'factor')):
            error = 'operator'

        if error is not None:
            message = "can't %s %s" % ("delete" if is_deletion else "assign to", error)
            self._add_syntax_error(message, node)

    def _add_indentation_error(self, message, spacing):
        self._add_error(903, "IndentationError: " + message, spacing)

    def _add_syntax_error(self, message, node):
        self._add_error(901, "SyntaxError: " + message, node)

    def _add_error(self, code, message, node):
        # Check if the issues are on the same line.
        line = node.start_pos[0]
        args = (code, message, node)
        self._error_dict.setdefault(line, args)

    def finalize(self):
        self._context.finalize()

        for code, message, node in self._error_dict.values():
            self.issues.append(Issue(node, code, message))


class ErrorFinderConfig(NormalizerConfig):
    normalizer_class = ErrorFinder